In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('training_reviews.csv', names = ['Text', 'Label'], header = None)
df.head()

Unnamed: 0,Text,Label
0,"As a popular sport, surfing was liked by many ...",1
1,'The Rookie' was a wonderful movie about the s...,1
2,I just came back from a pre-release viewing of...,1
3,I had a personal interest in this movie. When ...,1
4,This movie has so many wonderful elements to i...,1


In [3]:
import re
import nltk

def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                           text)
    text = (re.sub('[\W]+', ' ', text.lower()) +
            ' '.join(emoticons).replace('-', ''))
    return text

def remove_special_characters(text, remove_digits=True):
    pattern=r'[^a-zA-z0-9\s]'
    text=re.sub(pattern,'',text)
    return text

def simple_stemmer(text):
    ps = nltk.porter.PorterStemmer()
    text= ' '.join([ps.stem(word) for word in text.split()])
    return text


def clean_up_text(text, flag = 0):
    text_1 = preprocessor(text)
    text_2 = remove_special_characters(text_1)
    
    if flag == 1:
        text_2 = simple_stemmer(text_2)
        
    return text_2


# rev = df1.iloc[0]['Text']
# print(clean_up_text(rev)) ## call with no stemming
# print()
# print(clean_up_text(rev, 1)) ## call like this when stemming


#set flag = 1, if you want to stem
def process_data(data):

    for i in range(len(data)):
        #print(data[i])
        #print(data['Text'][i])
        data['Text'][i] = clean_up_text(data['Text'][i])
        
    return data



df = process_data(df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [4]:
from sklearn.model_selection import train_test_split

X = df.iloc[:, 0].values
Y = df.iloc[:, 1].values
# df1 = df[df.Label == 2]
# df2 = df[df.Label == 1]

#### DO Nueral_NEt stuff here
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.2, random_state = 0)


In [41]:
######### Getting Feature Sets from Lexicon

from nrclex import NRCLex

#Feature Set, is ordered by Length of review, followed by raw_emotio
#Foallaowed by affect_frequencies

X_2 = []
Y_2 = []

for i in range(len(df)):
    if i% 1000 == 1:
        print(i)
        
    review = df['Text'][i]
    
    text_object = NRCLex(review)
    review = review.split(" ")
    
    feature_list = []
    feature_list.append(len(review))
    
    raw_emotions = ["anger", "anticipation", "disgust", "fear", "joy", "negative", "positive", "sadness", "surprise", "trust"]
    
    raw_emotion = text_object.raw_emotion_scores
    for raw_em in raw_emotions:
        if raw_em not in raw_emotion:
            raw_emotion[raw_em] = 0
    raw_emotion = [(k,v) for k,v in raw_emotion.items()]
    raw_emotion.sort()
    if(i < 10):
        print(raw_emotion)
    raw_emotion = [v for (k,v) in raw_emotion]
    feature_list.extend(raw_emotion)
    
#     affect_frequencies = text_object.affect_frequencies
#     affect_frequencies = [(k,v) for k,v in affect_frequencies.items()]
#     affect_frequencies.sort()
#     affect_frequencies = [v for (k,v) in affect_frequencies]
#     feature_list.extend(affect_frequencies)
    
    X_2.append(feature_list)
    Y_2.append(df['Label'][i])
    
    
print(X_2[0:5], Y_2[0:5])


[('anger', 1), ('anticipation', 0), ('disgust', 0), ('fear', 2), ('joy', 1), ('negative', 3), ('positive', 1), ('sadness', 0), ('surprise', 0), ('trust', 2)]
1
[('anger', 0), ('anticipation', 4), ('disgust', 0), ('fear', 0), ('joy', 4), ('negative', 0), ('positive', 5), ('sadness', 0), ('surprise', 2), ('trust', 4)]
[('anger', 1), ('anticipation', 2), ('disgust', 1), ('fear', 1), ('joy', 3), ('negative', 4), ('positive', 7), ('sadness', 3), ('surprise', 1), ('trust', 5)]
[('anger', 2), ('anticipation', 6), ('disgust', 2), ('fear', 2), ('joy', 7), ('negative', 5), ('positive', 13), ('sadness', 2), ('surprise', 2), ('trust', 13)]
[('anger', 1), ('anticipation', 4), ('disgust', 0), ('fear', 1), ('joy', 8), ('negative', 3), ('positive', 10), ('sadness', 3), ('surprise', 6), ('trust', 5)]
[('anger', 21), ('anticipation', 26), ('disgust', 18), ('fear', 30), ('joy', 22), ('negative', 44), ('positive', 60), ('sadness', 20), ('surprise', 19), ('trust', 33)]
[('anger', 2), ('anticipation', 5), (

In [42]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from pandas import DataFrame

### number of features
feature_len = len(X_2[0])
label_list = ["Feature_" + str(i) for i in range(1, feature_len + 1)]

print(label_list)
X_2 = DataFrame(X_2, columns = label_list)
Y_2 = DataFrame(Y_2, columns = ["Label"])



X_train, X_test, Y_train, Y_test = train_test_split(X_2,Y_2, test_size = 0.2, random_state = 0)



sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

regressor = RandomForestRegressor(n_estimators=20, random_state=0)
regressor.fit(X_train, Y_train)
Y_pred = regressor.predict(X_test)

#print(Y_pred)


# print(confusion_matrix(Y_test,Y_pred))
# print(classification_report(Y_test,Y_pred))
#print(accuracy_score(Y_test, Y_pred))

['Feature_1', 'Feature_2', 'Feature_3', 'Feature_4', 'Feature_5', 'Feature_6', 'Feature_7', 'Feature_8', 'Feature_9', 'Feature_10', 'Feature_11']




In [45]:
# print(max(Y_pred))
# print(min(Y_pred))
# print(Y_pred[0:100])
# print(Y_train[0:100])
print(Y_pred[0:100])
Y_pred = [y.round() for y in Y_pred]
# print(Y_pred[0:100])
print(Y_pred[0:100])
print(confusion_matrix(Y_test,Y_pred))
print(classification_report(Y_test,Y_pred))
print(accuracy_score(Y_test, Y_pred))

[2.0, 1.0, 1.0, 2.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 2.0, 1.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 1.0, 2.0, 2.0, 2.0, 1.0, 2.0, 2.0, 1.0, 1.0, 2.0, 2.0, 1.0, 2.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 1.0, 1.0, 2.0, 2.0, 1.0, 2.0, 1.0, 1.0, 2.0, 1.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 2.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 1.0, 2.0, 1.0, 2.0, 2.0, 1.0, 2.0, 2.0, 1.0, 2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 2.0, 2.0, 2.0, 2.0, 2.0, 1.0, 1.0, 2.0, 2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 1.0, 1.0, 1.0]
[2.0, 1.0, 1.0, 2.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 2.0, 1.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 1.0, 2.0, 2.0, 2.0, 1.0, 2.0, 2.0, 1.0, 1.0, 2.0, 2.0, 1.0, 2.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 1.0, 1.0, 2.0, 2.0, 1.0, 2.0, 1.0, 1.0, 2.0, 1.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 2.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 1.0, 2.0, 1.0, 2.0, 2.0, 1.0, 2.0, 2.0, 1.0, 2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 2.0, 2.0, 2.0, 2.0, 2.0, 1.0, 1.0, 2.0, 2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 1.0, 1.0, 1.0

In [48]:
from sklearn.ensemble import RandomForestClassifier
RSEED = 50

train, test, train_labels, test_labels = train_test_split(X_2, Y_2, 
                                                          stratify = Y_2,
                                                          test_size = 0.2, 
                                                          random_state = RSEED)
# X_2 = DataFrame(X_2, columns = label_list)
# Y_2 = DataFrame(Y_2, columns = ["Label"])



# X_train, X_test, Y_train, Y_test = train_test_split(X_2,Y_2, test_size = 0.2, random_state = 0)



model = RandomForestClassifier(n_estimators=100, 
                               random_state=RSEED, 
                               max_features = 'sqrt',
                               n_jobs=-1, verbose = 1)

model.fit(train, train_labels)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.4s finished


RandomForestClassifier(max_features='sqrt', n_jobs=-1, random_state=50,
                       verbose=1)

In [50]:
import numpy as np 

n_nodes = []
max_depths = []

for ind_tree in model.estimators_:
    n_nodes.append(ind_tree.tree_.node_count)
    max_depths.append(ind_tree.tree_.max_depth)
    
print(f'Average number of nodes {int(np.mean(n_nodes))}')
print(f'Average maximum depth {int(np.mean(max_depths))}')

Average number of nodes 8893
Average maximum depth 29


In [51]:
train_rf_predictions = model.predict(train)
train_rf_probs = model.predict_proba(train)[:, 1]

rf_predictions = model.predict(test)
rf_probs = model.predict_proba(test)[:, 1]



[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.1s finished


In [54]:
print(accuracy_score(test_labels, rf_predictions))

0.694
