In [4]:
import pandas as pd

In [5]:
df_training = pd.read_csv('training_reviews.csv', names = ['Text', 'Label'], header = None)
df_testing = pd.read_csv('testing_set.csv', names = ['Text', 'Label'], header = None)

In [10]:
import re
import nltk

def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                           text)
    text = (re.sub('[\W]+', ' ', text.lower()) +
            ' '.join(emoticons).replace('-', ''))
    return text

def remove_special_characters(text, remove_digits=True):
    pattern=r'[^a-zA-z0-9\s]'
    text=re.sub(pattern,'',text)
    return text

def simple_stemmer(text):
    ps = nltk.porter.PorterStemmer()
    text= ' '.join([ps.stem(word) for word in text.split()])
    return text


def clean_up_text(text, flag = 0):
    text_1 = preprocessor(text)
    text_2 = remove_special_characters(text_1)
    
    if flag == 1:
        text_2 = simple_stemmer(text_2)
        
    return text_2

# rev = df1.iloc[0]['Text']
# print(clean_up_text(rev)) ## call with no stemming
# print()
# print(clean_up_text(rev, 1)) ## call like this when stemming


#set flag = 1, if you want to stem
def process_data(data, flag = 0):

    for i in range(len(data)):
        if i%1000 == 0:
            print(i)
        data['Text'][i] = clean_up_text(data['Text'][i], flag)
        
    return data



df_training = process_data(df_training)
df_testing = process_data(df_testing)
df_training_stemmed = process_data(df_training, flag = 1)
df_testing_stemmed = process_data(df_testing, flag = 1)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



20000
21000
22000
23000
24000
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000


In [14]:
from nrclex import NRCLex
from pandas import DataFrame
from sklearn.model_selection import train_test_split

#Feature Set, is ordered by Length of review, followed by raw_emotio
#Foallaowed by affect_frequencies


def generate_features_via_lexicon(df):
    
    X_2 = []
    Y_2 = []



    for i in range(len(df)):
        if i% 1000 == 1:
            print(i)

        review = df['Text'][i]

        text_object = NRCLex(review)
        review = review.split(" ")

        feature_list = []
        feature_list.append(len(review))

        raw_emotions = ["anger", "anticipation", "disgust", "fear", "joy", "negative", "positive", "sadness", "surprise", "trust"]
        affect_freq = raw_emotions = ["anger", "anticip", "anticipation", "disgust", "fear", "joy", "negative", "positive", "sadness", "surprise", "trust"]


        raw_emotion = text_object.raw_emotion_scores
        for raw_em in raw_emotions:
            if raw_em not in raw_emotion:
                raw_emotion[raw_em] = 0
        raw_emotion = [(k,v) for k,v in raw_emotion.items()]
        raw_emotion.sort()
        #if(i < 10):
            #print(raw_emotion)
        raw_emotion = [v for (k,v) in raw_emotion]
        feature_list.extend(raw_emotion)

        affect_frequencies = text_object.affect_frequencies
        for af in affect_freq:
            if af not in affect_frequencies:
                affect_frequencies[af] = 0
        affect_frequencies = [(k,v) for k,v in affect_frequencies.items()]
        affect_frequencies.sort()
        affect_frequencies = [v for (k,v) in affect_frequencies]
        feature_list.extend(affect_frequencies)

        X_2.append(feature_list)
        Y_2.append(df['Label'][i])
        
    feature_len = len(X_2[0])
    label_list = ["Feature_" + str(i) for i in range(1, feature_len + 1)]

    X_2 = DataFrame(X_2, columns = label_list)
    Y_2 = DataFrame(Y_2, columns = ["Label"])

    return X_2, Y_2


X1, Y1 = generate_features_via_lexicon(df_training)
X2, Y2 = generate_features_via_lexicon(df_training_stemmed)


X1_train, X1_test, Y1_train, Y1_test = train_test_split(X1,Y1, test_size = 0.2, random_state = 0)
X2_train, X2_test, Y2_train, Y2_test = train_test_split(X2,Y2, test_size = 0.2, random_state = 0)

1
1001
2001
3001
4001
5001
6001
7001
8001
9001
10001
11001
12001
13001
14001
15001
16001
17001
18001
19001
20001
21001
22001
23001
24001
1
1001
2001
3001
4001
5001
6001
7001
8001
9001
10001
11001
12001
13001
14001
15001
16001
17001
18001
19001
20001
21001
22001
23001
24001


In [22]:
X1_Test, Y1_Test = generate_features_via_lexicon(df_testing)
X2_Test, Y2_Test = generate_features_via_lexicon(df_testing_stemmed)

1
1001
2001
3001
4001
5001
6001
7001
8001
9001
10001
11001
12001
13001
14001
15001
16001
17001
18001
19001
20001
21001
22001
23001
24001
1
1001
2001
3001
4001
5001
6001
7001
8001
9001
10001
11001
12001
13001
14001
15001
16001
17001
18001
19001
20001
21001
22001
23001
24001


In [26]:
##Logistic Regression
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(max_iter=1000)
lr.fit(X1_train, Y1_train)
print("score on validation: " + str(lr.score(X1_test, Y1_test)))
print("score on test: "+ str(lr.score(X1_Test, Y1_Test)))

lr = LogisticRegression(max_iter=1000)
lr.fit(X2_train, Y2_train)
print("score on validation: " + str(lr.score(X2_test, Y2_test)))
print("score on test: "+ str(lr.score(X2_Test, Y2_Test)))

  return f(*args, **kwargs)


score on validation: 0.659
score on test: 0.6546


  return f(*args, **kwargs)


score on validation: 0.659
score on test: 0.6546


In [25]:
#Decision Tree
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier()
clf.fit(X1_train, Y1_train)
print("score on validation: "  + str(clf.score(X1_test, Y1_test)))
print("score on Test: " + str(clf.score(X1_Test, Y1_Test)))

clf = DecisionTreeClassifier()
clf.fit(X2_train, Y2_train)
print("score on validation: "  + str(clf.score(X2_test, Y2_test)))
print("score on Test: " + str(clf.score(X2_Test, Y2_Test)))

score on validation: 0.5564
score on Test: 0.57404
score on validation: 0.562
score on Test: 0.57356


In [27]:
#Bagging Decision Tree

from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# max_samples: maximum size 0.5=50% of each sample taken from the full dataset
# max_features: maximum of features 1=100% taken here all 10K 
# n_estimators: number of decision trees 
bg=BaggingClassifier(DecisionTreeClassifier(),max_samples=0.5,max_features=1.0,n_estimators=10)
bg.fit(X1_train, Y1_train)
print("score on validation: " + str(bg.score(X1_test, Y1_test)))
print("score on Test: "+ str(bg.score(X1_Test, Y1_Test)))

bg=BaggingClassifier(DecisionTreeClassifier(),max_samples=0.5,max_features=1.0,n_estimators=10)
bg.fit(X2_train, Y2_train)
print("score on validation: " + str(bg.score(X2_test, Y2_test)))
print("score on Test: "+ str(bg.score(X2_Test, Y2_Test)))

  return f(*args, **kwargs)


score on validation: 0.6122
score on Test: 0.6142


  return f(*args, **kwargs)


score on validation: 0.6126
score on Test: 0.61452


In [29]:
#Boosting Decision Tree

from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

adb = AdaBoostClassifier(DecisionTreeClassifier(min_samples_split=10,max_depth=4),n_estimators=10,learning_rate=0.6)
adb.fit(X1_train, Y1_train)
print("score on validation: " + str(adb.score(X1_test, Y1_test)))
print("score on Test: "+ str(adb.score(X1_Test, Y1_Test)))

adb.fit(X2_train, Y2_train)
print("score on validation: " + str(adb.score(X2_test, Y2_test)))
print("score on Test: "+ str(adb.score(X2_Test, Y2_Test)))

  return f(*args, **kwargs)


score on validation: 0.654
score on Test: 0.65172


  return f(*args, **kwargs)


score on validation: 0.654
score on Test: 0.65172


In [31]:
## Random Forest

from sklearn.ensemble import RandomForestClassifier
# n_estimators = number of decision trees
rf = RandomForestClassifier(n_estimators=30, max_depth=9)
rf.fit(X1_train, Y1_train)
print("score on Validation: " + str(rf.score(X1_test, Y1_test)))
print("score on Test: "+ str(rf.score(X1_Test, Y1_Test)))

rf.fit(X2_train, Y2_train)
print("score on Validation: " + str(rf.score(X2_test, Y2_test)))
print("score on Test: "+ str(rf.score(X2_Test, Y2_Test)))

  after removing the cwd from sys.path.


score on Validation: 0.6644
score on Test: 0.65384


  


score on Validation: 0.665
score on Test: 0.6526


In [35]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
import numpy as np

def sampling(Y):
    ret_Y = []
    for y in Y:
        e = np.random.binomial(size =1, n=1, p = y-1)
        ret_Y.append(1 + e)
    return ret_Y

def rounding(Y):
    ret_Y = []
    for y in Y:
        ret_Y.append(y.round())
    return ret_Y

regressor = RandomForestRegressor(n_estimators=20, random_state=0)
regressor.fit(X1_train, Y1_train)
Y_pred = regressor.predict(X1_test)
Y1_pred = regressor.predict(X1_Test)

print("score on validation (sampling):", accuracy_score(Y1_test, sampling(Y_pred)))
print("score on Test (sampling):", accuracy_score(Y1_Test, sampling(Y1_pred)))
print("score on validation (rounding):", accuracy_score(Y1_test, rounding(Y_pred)))
print("score on Test (rounding):", accuracy_score(Y1_Test, rounding(Y1_pred)))

regressor.fit(X2_train, Y2_train)
Y_pred_2 = regressor.predict(X2_test)
Y2_pred_2 = regressor.predict(X2_Test)

print("score on validation (sampling):", accuracy_score(Y2_test, sampling(Y_pred_2)))
print("score on Test (sampling):", accuracy_score(Y2_Test, sampling(Y2_pred_2)))
print("score on validation (rounding):", accuracy_score(Y2_test, rounding(Y_pred_2)))
print("score on Test (rounding):", accuracy_score(Y2_Test, rounding(Y2_pred_2)))



score on validation (sampling): 0.5748
score on Test (sampling): 0.57324
score on validation (rounding): 0.626
score on Test (rounding): 0.62668




score on validation (sampling): 0.569
score on Test (sampling): 0.56672
score on validation (rounding): 0.626
score on Test (rounding): 0.62668
