In [14]:
import pandas as pd
import tensorflow as tf
import os
import tensorflow_text as tf_text
from nltk.corpus import stopwords
import gensim
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
import random
import pickle

import tensorflow as tf

from tensorflow.keras.layers import Dense, InputLayer, GlobalMaxPool1D, Dropout, Conv1D, MaxPool1D, Flatten, Embedding, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.activations import relu, sigmoid
from tensorflow.keras.optimizers import Adam, SGD

from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from sklearn.model_selection import train_test_split

In [15]:
W2V_SIZE = 300
W2V_WINDOW = 7
W2V_EPOCH = 32
W2V_MIN_COUNT = 10 
SEQUENCE_LENGTH = 300
EPOCHS = 8
BATCH_SIZE = 1024

In [16]:
def first_data_prep():
    data = pd.read_csv('Data/twitter_data.csv', 
                      encoding='latin-1', names=['sentiment', 'id', 'date', 'flag', 'user',
                                                'text'])
    data['sentiment'].replace(4, 1, inplace=True)
    data = data.sample(frac=1).reset_index(drop=True).copy()
    data['text'] = data['text'].str.lower()
    return data

In [17]:
def delete_nicknames(row):
    # Prepare list of words
    words = row.split()
    # Remove nicknames
    for word in words:
        if word[0] == '@':
            words.remove(word)
    # Return string 
    return ' '.join(word for word in words)

In [18]:
def stratified_split(df, size):
    # Split dataset into smaller one 
    col_list = list(df.columns)
    # Drop target column name
    col_list.pop(0)
    x_train, x_valid = train_test_split(
    df, random_state=1, stratify=df['sentiment'], test_size=size)
    # Prepare new indexes 
    x_valid.reset_index(drop=True, inplace=True)
    return x_valid 
    

In [19]:
def replace_with_space(text):
    # Replace new rows with space 
    text = text.replace('\n', " ").replace("\r", " ")
    # Create list of all not needed chars 
    punc_list = '!"@#$%^&*()+_-.<>?/:;[]{}|\~'
    # Make transformation with dict that contains punc_list chars
    t = str.maketrans(dict.fromkeys(punc_list, " "))
    # Apply transformation
    text = text.translate(t)
    # Replace single quote with empty char
    t = text.maketrans(dict.fromkeys("'`"))
    text.translate(t)
    
    return text
    

In [20]:
def remove_stop_words(text):
    # Prepare set of stopwords
    stop_words = set(stopwords.words('english'))
    
    # Remove stopwords from the text
    filtered_text = [word for word in text.split() if not word in stop_words]
    
    return filtered_text

In [36]:
def tokenize(): 
    data1 = first_data_prep()
    data = stratified_split(data1, 0.1)
    # Apply replace func that replace chars with spaces
    data.loc[:, 'text'] = data['text'].apply(lambda x: replace_with_space(x))
    # Apply func that removes stop words
    data.loc[:, 'text'] = data['text'].apply(lambda x: remove_stop_words(x))
    w2v_model = gensim.models.word2vec.Word2Vec(vector_size=W2V_SIZE, 
                                            window=W2V_WINDOW, 
                                            min_count=W2V_MIN_COUNT, 
                                            workers=8)
    
    documents = data.text
    w2v_model.build_vocab(documents)
    
    tok = tf.keras.preprocessing.text.Tokenizer(num_words=100000)
    # Updates internal vocabulary based on a list of texts 
    tok.fit_on_texts(list(data['text']))
    # Transforms each text in texts to a sequence of integers.
    seq = tok.texts_to_sequences(list(data['text']))
    # Pad sequences to make them same lenght 
    tf_ready = tf.keras.preprocessing.sequence.pad_sequences(seq)
    
    words = w2v_model.wv.index_to_key
    vocab_size = len(tok.word_index) + 1
    
    w2v_model.train(documents, total_examples=len(documents), epochs=W2V_EPOCH)
    
    embedding_matrix = np.zeros((vocab_size, W2V_SIZE))
    for word, i in tok.word_index.items():
        if word in w2v_model.wv:
            embedding_matrix[i] = w2v_model.wv[word]
    embedding_layer = Embedding(vocab_size, W2V_SIZE,
                            weights=[embedding_matrix], input_length=SEQUENCE_LENGTH, trainable=False)
    
    tf_df = pd.DataFrame(tf_ready)
    tf_df['sentiment'] = data['sentiment']
    
    return tf_df, tok, embedding_layer

In [37]:
def pickle_data():
    with open('tok.pkl', 'wb') as f:
        pickle.dump(tok, f)
        
    with open('embedding.pkl', 'wb') as f:
        pickle.dump(embedding_layer, f)
        
    
    with open('tf_df.pkl', 'wb') as f:
        pickle.dump(tf_df, f)

In [38]:
def configure_cnn(data,
                embedding,
                layers=None, 
                dropout_rate=0,
                kernel_size=10,
                stride=10,
                pool_size=2,
                optimizer='Adam',
                loss='binary_crossentropy',
                 kernel_initializer='lecun_normal',
                 kernel_regularizer=tf.keras.regularizers.L2(0.01)
                ):
    """ Layers argument shape:
    [[number of nodes, activate function], 
    [number of nodes, activate function],
    ...]
    
    """
    input_len = data.shape[1] - 1 
    model = Sequential()
    model.add(embedding)
    model.add(Conv1D(32, 3, padding='same', activation='relu'))
    model.add(MaxPool1D(3))
    model.add(Conv1D(64, 3, activation='relu'))
    model.add(MaxPool1D(pool_size=3))
    model.add(Conv1D(128, 3, activation='relu'))
    model.add(GlobalMaxPool1D())
    model.add(Flatten())
    
    if dropout_rate > 0:
        model.add(Dropout(dropout_rate))
    
    if layers != None:
        for node in layers[1:]:
            model.add(Dense(node[0], activation=node[1], kernel_initializer=kernel_initializer, 
                            kernel_regularizer=kernel_regularizer))
    model.add(Dense(10, activation='sigmoid'))
    model.add(Dense(1, activation='sigmoid', kernel_initializer=kernel_initializer, 
                    kernel_regularizer=kernel_regularizer))
    
    model.compile(loss=loss, 
                 optimizer=optimizer,
                 metrics=['accuracy'])
    
    return model

In [39]:
def configure_rnn(data,
                embedding,
                layers=None, 
                dropout_rate=0,
                kernel_size=10,
                stride=10,
                pool_size=2,
                optimizer='Adam',
                loss='binary_crossentropy',
                 kernel_initializer='lecun_normal',
                 kernel_regularizer=tf.keras.regularizers.L2(0.01)
                ):
    """ Layers argument shape:
    [[number of nodes, activate function], 
    [number of nodes, activate function],
    ...]
    
    """
    input_len = data.shape[1] - 1 
    model = Sequential()
    model.add(embedding)
    model.add(Dropout(0.5))
    model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))    

    if layers != None:
        for node in layers[1:]:
            model.add(Dense(node[0], activation=node[1], kernel_initializer=kernel_initializer, 
                            kernel_regularizer=kernel_regularizer))
    model.add(Dense(1, activation='sigmoid', kernel_initializer=kernel_initializer, 
                    kernel_regularizer=kernel_regularizer))
    
    model.compile(loss=loss, 
                 optimizer=optimizer,
                 metrics=['accuracy'])
    
    return model

In [41]:
tf_df, tok, embedding_layer = tokenize()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [42]:
model = configure_cnn(tf_df, embedding_layer)
model_rnn = configure_rnn(tf_df, embedding_layer)

In [26]:
features = tf_df.columns.tolist()
features.remove('sentiment')
X_train, X_test, y_train, y_test = train_test_split(tf_df[features], tf_df['sentiment'], test_size=0.33, random_state=42)

model_rnn.summary()

In [43]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 300, 300)          44412300  
_________________________________________________________________
conv1d (Conv1D)              (None, 300, 32)           28832     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 100, 32)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 98, 64)            6208      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 32, 64)            0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 30, 128)           24704     
_________________________________________________________________
global_max_pooling1d (Global (None, 128)               0

In [44]:
model_rnn.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 300, 300)          44412300  
_________________________________________________________________
dropout (Dropout)            (None, 300, 300)          0         
_________________________________________________________________
lstm (LSTM)                  (None, 100)               160400    
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 101       
Total params: 44,572,801
Trainable params: 160,501
Non-trainable params: 44,412,300
_________________________________________________________________


In [None]:
model.fit(X_train, y_train, epochs=EPOCHS, verbose=1,
         validation_data=(X_test, y_test), batch_size=BATCH_SIZE)

In [None]:
model.save('cnn')

In [28]:
model_rnn.fit(X_train, y_train, epochs=EPOCHS, verbose=1,
         validation_data=(X_test, y_test), batch_size=BATCH_SIZE)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<tensorflow.python.keras.callbacks.History at 0x7f14df780fa0>

In [29]:
model_rnn.save('rnn')

INFO:tensorflow:Assets written to: rnn/assets


In [12]:
names = pd.read_csv('Data/IMDb movies.csv')
ratings = pd.read_csv('Data/IMDb ratings.csv')

In [15]:
full = names.merge(ratings, on='imdb_title_id')
full.loc[:, 'year'] = full['year'].apply(lambda x: to_int(x))

In [67]:
def to_int(row):
    try: 
        return float(row)
    except:
        return np.NaN

def prepare_reviews(row):
    try:
        if int(row) < 10:
            return row * 10
        elif int(row) > 100:
            return int(row)/10 
        else:
            return int(row)
    except:
        return np.NaN


In [67]:
last_data = full.loc[(full['country'].isin(['USA', 'Canada'])) & (full['year'] > 2015)]
last_data.loc[:, 'reviews_from_users'] = last_data['reviews_from_users'].apply(lambda x: prepare_reviews(x))
df_ready = last_data[['original_title', 'reviews_from_users']]
df_ready.to_csv('Data/titles_with_reviews.csv')
df_ready['preds'] = np.NaN

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [73]:
def evaluate_model():
    data = pd.read_csv('preds.csv')
    data = data.dropna(subset=['preds'])
    data = data.drop_duplicates(subset='original_title').loc[:, ['original_title', 'reviews_from_users', 'preds', 'preds_diff']]
    data['preds'] = data['preds'].apply(lambda x: to_int(x))
    data['reviews_from_users'] = data['reviews_from_users'].apply(lambda x: to_int(x))
    data['preds_diff'] = data['preds'] - data['reviews_from_users']
    data['preds_diff'] = data['preds_diff'].apply(np.absolute)
    data = data.loc[(data['preds'] < 100) & (data['reviews_from_users'] < 100)]

In [75]:
data['preds_diff'].mean()

25.574552697740156

In [77]:
data

Unnamed: 0,original_title,reviews_from_users,preds,preds_diff
1,'77,40.0,50.268333,10.268333
2,The Evil Within,46.0,39.716360,6.283640
3,Fahrenheit 451,20.6,53.770604,33.170604
4,Motherless Brooklyn,31.9,53.798568,21.898568
5,A Million Little Pieces,36.0,46.343595,10.343595
...,...,...,...,...
246057,Better Off Single,10.0,43.384030,33.384030
246058,The Tell-Tale Heart,20.0,61.978381,41.978381
246059,This Is Your Death,48.0,33.053585,14.946415
246060,Elvis & Nixon,93.0,73.561830,19.438170


In [86]:
['Analiza sentymentu z wykorzystaniem sieci neuronowych']

['Analiza sentymentu z wykorzystaniem sieci neuronowych']

In [87]:
['Analiza', 'sentymentu', 'z',  'wykorzystaniem',  'sieci',  'neuronowych']

['Analiza', 'sentymentu', 'z', 'wykorzystaniem', 'sieci', 'neuronowych']

In [88]:
[311, 123, 3, 1, 2525, 1311]

[311, 123, 3, 1, 2525, 1311]

In [89]:
[0, 0, 0, 0, 311, 123, 3, 1, 2525, 1311]

[0, 0, 0, 0, 311, 123, 3, 1, 2525, 1311]