# Sentiment Classification of Park Reviews

In [1]:
import pandas as pd

In [2]:
parkReviews = pd.read_csv('ParkReviewsLang.csv', index_col=0)

In [4]:
parkReviews.shape

(41822, 11)

In [3]:
parkReviews.head()

Unnamed: 0,review_for,review_id,username,user_url,published,date_retrieved,num_stars,num_reviews,review_text,label,lang
0,Parc de la Capture-d'Ethan-Allen,ChdDSUhNMG9nS0VJQ0FnSUNpeGF6TTNnRRAB,Claudia,https://www.google.com/maps/contrib/1001449741...,7 months ago,2021-06-20 22:04:09.211296,4.0,107.0,One of the nicest entry points to this invitin...,1,en
1,Parc de la Capture-d'Ethan-Allen,ChdDSUhNMG9nS0VJQ0FnSURDOGEyMGpnRRAB,Nate Neel,https://www.google.com/maps/contrib/1121030547...,8 months ago,2021-06-20 22:04:09.212245,5.0,121.0,"Waterfront to fish or just relax, great place ...",1,en
2,Parc de la Capture-d'Ethan-Allen,ChdDSUhNMG9nS0VJQ0FnSUM4Nk9Ya3lnRRAB,Yucel Salimoglu,https://www.google.com/maps/contrib/1034180738...,11 months ago,2021-06-20 22:04:09.213178,4.0,79.0,Everything except the parking is good here.,1,en
3,Parc de la Capture-d'Ethan-Allen,ChZDSUhNMG9nS0VJQ0FnSUNVdWNUbE9REAE,COCO BEADZ,https://www.google.com/maps/contrib/1036060504...,a year ago,2021-06-20 22:04:09.214115,4.0,128.0,"Defenely the best park in Montreal East, Tetre...",1,en
4,Parc de la Capture-d'Ethan-Allen,ChdDSUhNMG9nS0VJQ0FnSUMwdHJDTm1nRRAB,Anna Maria Fiore,https://www.google.com/maps/contrib/1016779009...,a year ago,2021-06-20 22:04:09.215069,5.0,39.0,It's so peaceful and happy place near the water,1,en


In [6]:
frenchReviews = parkReviews[parkReviews['lang'] == 'fr']
frenchReviews.shape

(17149, 11)

In [12]:
frenchReviewsDf = frenchReviews.copy()
frenchReviewText = frenchReviews['review_text'].apply(lambda x: x.split('(Original)')[-1].strip())
frenchReviewsDf['french_text'] = frenchReviewText

In [13]:
frenchReviewsDf.head()

Unnamed: 0,review_for,review_id,username,user_url,published,date_retrieved,num_stars,num_reviews,review_text,label,lang,french_text
22684,Parc de la Capture-d'Ethan-Allen,ChdDSUhNMG9nS0VJQ0FnSUNxM3JIQjhBRRAB,Claude Gagnon,https://www.google.com/maps/contrib/1182846684...,a week ago,2021-06-20 22:04:09.230541,4.0,86.0,(Translated by Google) Very beautiful park to ...,1,fr,Tres beau parc pour faire un pinic et profiter...
22685,Parc de la Capture-d'Ethan-Allen,ChdDSUhNMG9nS0VJQ0FnSURLckpMTm5nRRAB,Guy Durand,https://www.google.com/maps/contrib/1056233036...,a month ago,2021-06-20 22:04:09.232736,5.0,53.0,(Translated by Google) The people are all very...,1,fr,Les gens sont tous très sociables.\nExceptionnel.
22686,Parc de la Capture-d'Ethan-Allen,ChZDSUhNMG9nS0VJQ0FnSUNBNktMclNREAE,Dania Pascual,https://www.google.com/maps/contrib/1064940459...,2 years ago,2021-06-20 22:04:09.233660,5.0,41.0,"(Translated by Google) Nice place to walk, jog...",1,fr,"Belle place pour marcher, jogger, promener le ..."
22687,Parc de la Capture-d'Ethan-Allen,ChdDSUhNMG9nS0VJQ0FnSUNxNWFEUzBBRRAB,Stéphane Lessard,https://www.google.com/maps/contrib/1061684432...,a week ago,2021-06-20 22:04:09.234580,4.0,7.0,(Translated by Google) Excellent food!\n\n(Ori...,1,fr,Nourriture excellente !
22688,Parc de la Capture-d'Ethan-Allen,ChZDSUhNMG9nS0VJQ0FnSUNLaTZ2YVFBEAE,Pitchou Kasongo,https://www.google.com/maps/contrib/1176531246...,2 months ago,2021-06-20 22:04:09.235577,4.0,93.0,(Translated by Google) I like to get some fres...,1,fr,J aime bien pour prendre de l air frais


## Calculating summary statistics 

In [14]:
import string

frenchReviewsDf['word_count'] = [len(review.split()) for review in frenchReviewsDf['french_text']]

frenchReviewsDf['uppercase_char_count'] = [sum(char.isupper() for char in review) \
                              for review in frenchReviewsDf['french_text']]                           

frenchReviewsDf['special_char_count'] = [sum(char in string.punctuation for char in review) \
                            for review in frenchReviewsDf['french_text']]       

In [15]:
pos_reviews = frenchReviewsDf[frenchReviewsDf['label'] == 1]
neg_reviews = frenchReviewsDf[frenchReviewsDf['label'] == 0]

After breaking down the dataset into positive and negative reviews 

In [16]:
pos_reviews['word_count'].describe()

count    14635.000000
mean        12.701537
std         16.127711
min          1.000000
25%          4.000000
50%          8.000000
75%         15.000000
max        474.000000
Name: word_count, dtype: float64

In [17]:
neg_reviews['word_count'].describe()

count    2514.000000
mean       16.305091
std        19.940490
min         1.000000
25%         5.000000
50%        10.000000
75%        20.000000
max       258.000000
Name: word_count, dtype: float64

In [18]:
14635/(14635 + 2514)

0.8534025307598111

Next we compare the number of uppercase letters used in the postive and negative reviews. As we can see there is no real difference between the two. 

In [19]:
pos_reviews['uppercase_char_count'].describe()

count    14635.000000
mean         1.819542
std          2.511317
min          0.000000
25%          1.000000
50%          1.000000
75%          2.000000
max        107.000000
Name: uppercase_char_count, dtype: float64

In [20]:
neg_reviews['uppercase_char_count'].describe()

count    2514.000000
mean        2.029037
std         3.391481
min         0.000000
25%         1.000000
50%         1.000000
75%         2.000000
max        80.000000
Name: uppercase_char_count, dtype: float64

Finally let's take a look at the special characters present in the positive and negativve group of reviews.

In [21]:
pos_reviews['special_char_count'].describe()

count    14635.000000
mean         2.345268
std          3.710336
min          0.000000
25%          0.000000
50%          1.000000
75%          3.000000
max         94.000000
Name: special_char_count, dtype: float64

In [22]:
neg_reviews['special_char_count'].describe()

count    2514.000000
mean        2.945505
std         4.274390
min         0.000000
25%         0.000000
50%         2.000000
75%         4.000000
max        63.000000
Name: special_char_count, dtype: float64

## Examining the most frequent words

In [23]:
from collections import Counter

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/andreamock/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [24]:
def getMostCommonWords(reviews, n_most_common, stopwords=None):

    # flatten review column into a list of words, and set each to lowercase
    flattened_reviews = [word for review in reviews for word in \
                         review.lower().split()]


    # remove punctuation from reviews
    flattened_reviews = [''.join(char for char in review if \
                                 char not in string.punctuation) for \
                         review in flattened_reviews]


    # remove stopwords, if applicable
    if stopwords:
        flattened_reviews = [word for word in flattened_reviews if \
                             word not in stopwords]


    # remove any empty strings that were created by this process
    flattened_reviews = [review for review in flattened_reviews if review]

    return Counter(flattened_reviews).most_common(n_most_common)

In [26]:
getMostCommonWords(pos_reviews['french_text'], 15)

[('de', 8125),
 ('parc', 6350),
 ('pour', 6085),
 ('et', 5374),
 ('les', 3795),
 ('très', 3631),
 ('un', 3510),
 ('beau', 3264),
 ('la', 3110),
 ('le', 2947),
 ('des', 2541),
 ('en', 2281),
 ('à', 2241),
 ('avec', 1934),
 ('bien', 1706)]

In [27]:
getMostCommonWords(neg_reviews['french_text'], 15)

[('de', 2048),
 ('parc', 1001),
 ('les', 930),
 ('et', 930),
 ('pour', 920),
 ('le', 811),
 ('pas', 736),
 ('la', 720),
 ('un', 666),
 ('des', 499),
 ('à', 496),
 ('a', 476),
 ('mais', 471),
 ('en', 433),
 ('il', 426)]

The most common words involve a lot of common words. Therefore we will look at them without stopwords and determine the most common words. 

In [28]:
getMostCommonWords(pos_reviews['french_text'], 10, stopwords.words('french'))

[('parc', 6350),
 ('très', 3631),
 ('beau', 3264),
 ('bien', 1706),
 ('endroit', 1702),
 ('a', 1697),
 ('enfants', 1500),
 ('jeux', 1299),
 ('belle', 1267),
 ('cest', 1036)]

In [30]:
getMostCommonWords(neg_reviews['french_text'], 15, stopwords.words('french'))

[('parc', 1001),
 ('a', 476),
 ('très', 387),
 ('beau', 330),
 ('cest', 296),
 ('enfants', 278),
 ('bien', 269),
 ('petit', 225),
 ('beaucoup', 218),
 ('plus', 206),
 ('trop', 195),
 ('jeux', 192),
 ('endroit', 186),
 ('terrain', 174),
 ('peu', 160)]

## Text vectorization

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split

In [32]:
vectorizer = TfidfVectorizer(min_df=15)
bow = vectorizer.fit_transform(list(frenchReviewsDf['french_text']))
labels = frenchReviewsDf['label']

In [33]:
bow.shape

(17149, 1180)

In [34]:
vectorizer.get_feature_names()

['10',
 '12',
 '15',
 '18',
 '2017',
 '2018',
 '2019',
 '2020',
 '50',
 'abreuvoir',
 'abris',
 'absolument',
 'accessible',
 'accessibles',
 'accueil',
 'accueillant',
 'accès',
 'achalandé',
 'activité',
 'activités',
 'adapté',
 'adaptés',
 'admirer',
 'adore',
 'adorent',
 'adoré',
 'adultes',
 'afin',
 'agreable',
 'agréable',
 'agréables',
 'ai',
 'ailleurs',
 'aime',
 'aiment',
 'aimer',
 'aimé',
 'ainsi',
 'air',
 'aire',
 'aires',
 'ait',
 'alentours',
 'aller',
 'allez',
 'allé',
 'allée',
 'alors',
 'amateurs',
 'ambiance',
 'amener',
 'ami',
 'amis',
 'amour',
 'amoureux',
 'amusant',
 'amusement',
 'amusent',
 'amuser',
 'amélioration',
 'améliorer',
 'aménagement',
 'aménagements',
 'aménager',
 'aménagé',
 'aménagée',
 'aménagés',
 'an',
 'ancien',
 'and',
 'angus',
 'animation',
 'animaux',
 'animé',
 'anjou',
 'année',
 'années',
 'ans',
 'apaisant',
 'apporter',
 'apprécier',
 'apprécié',
 'après',
 'arbre',
 'arbres',
 'arc',
 'architecture',
 'argent',
 'arrondissem

In [35]:
len(vectorizer.get_feature_names())

1180

In [36]:
tfidfDict = dict(zip(vectorizer.get_feature_names(), bow.toarray()[0]))

In [38]:
tfidfDict['améliorer']

0.0

In [39]:
featureDf = pd.DataFrame.from_dict(tfidfDict, 
                                   orient='index', columns=['tfidf'])
featureDf.reset_index(inplace=True)
featureDf = featureDf.rename(columns = {'index':'feature'})

In [40]:
featureDf.sort_values('tfidf')[-10:]

Unnamed: 0,feature,tfidf
827,pour,0.16261
379,et,0.173683
152,beau,0.197146
1089,un,0.200496
398,faire,0.308118
295,de,0.313332
1079,tres,0.359868
664,nature,0.371318
558,la,0.425115
848,profiter,0.447984


Now we can take a look at the words that have the highest tfidf score in the positive and negative sentiment datasets.

In [43]:
vectorizer_pos = TfidfVectorizer(min_df=15)
bow_pos = vectorizer_pos.fit_transform(list(pos_reviews['french_text']))
labels_pos = pos_reviews['label']

In [44]:
vectorizer_neg = TfidfVectorizer(min_df=15)
bow_neg = vectorizer_neg.fit_transform(list(neg_reviews['french_text']))
labels_neg = neg_reviews['label']

In [45]:
tfidfDictPos = dict(zip(vectorizer_pos.get_feature_names(), bow_pos.toarray()[0]))
tfidfDictNeg = dict(zip(vectorizer_neg.get_feature_names(), bow_neg.toarray()[0]))

In [46]:
posFeatureDf = pd.DataFrame.from_dict(tfidfDictPos, 
                                   orient='index', columns=['tfidf'])
posFeatureDf.reset_index(inplace=True)
posFeatureDf = posFeatureDf.rename(columns = {'index':'feature'})

In [47]:
negFeatureDf = pd.DataFrame.from_dict(tfidfDictNeg, 
                                   orient='index', columns=['tfidf'])
negFeatureDf.reset_index(inplace=True)
negFeatureDf = negFeatureDf.rename(columns = {'index':'feature'})

In [48]:
posFeatureDf.sort_values('tfidf')[-15:]

Unnamed: 0,feature,tfidf
339,faites,0.0
340,familial,0.0
323,eu,0.0
322,ete,0.0
625,parc,0.149239
712,pour,0.161498
320,et,0.173906
129,beau,0.193378
930,un,0.20258
336,faire,0.309099


In [49]:
negFeatureDf.sort_values('tfidf')[-15:]

Unnamed: 0,feature,tfidf
108,gens,0.0
107,gazon,0.0
97,fait,0.0
313,être,0.0
105,fontaine,0.0
104,font,0.0
103,fois,0.0
102,fleuve,0.0
101,fin,0.0
100,fermé,0.0


In [137]:
# select 200 best features 
selected_features = SelectKBest(chi2, k=200).fit(bow, labels).get_support(indices=True)

In [146]:
# use selected features for vectorizer
vectorizer = TfidfVectorizer(min_df=15, vocabulary=selected_features)

bow2 = vectorizer.fit_transform(list(parkReviews['review_text']))
bow2

<41822x200 sparse matrix of type '<class 'numpy.float64'>'
	with 0 stored elements in Compressed Sparse Row format>

In [142]:
vectorizer.get_feature_names()

[20,
 37,
 76,
 85,
 93,
 109,
 124,
 130,
 140,
 141,
 143,
 149,
 150,
 164,
 173,
 177,
 182,
 185,
 201,
 202,
 215,
 220,
 238,
 243,
 262,
 275,
 295,
 297,
 299,
 301,
 343,
 353,
 355,
 363,
 383,
 392,
 396,
 397,
 407,
 411,
 436,
 443,
 450,
 454,
 464,
 478,
 486,
 493,
 494,
 495,
 496,
 500,
 511,
 512,
 515,
 518,
 533,
 534,
 540,
 541,
 564,
 574,
 606,
 608,
 609,
 618,
 639,
 684,
 701,
 721,
 728,
 729,
 736,
 748,
 761,
 767,
 779,
 788,
 816,
 821,
 843,
 861,
 864,
 873,
 892,
 913,
 919,
 922,
 924,
 929,
 962,
 967,
 968,
 969,
 1004,
 1005,
 1007,
 1011,
 1062,
 1063,
 1064,
 1077,
 1082,
 1092,
 1101,
 1148,
 1152,
 1158,
 1174,
 1192,
 1198,
 1200,
 1206,
 1211,
 1219,
 1220,
 1221,
 1222,
 1226,
 1228,
 1246,
 1248,
 1249,
 1258,
 1271,
 1292,
 1301,
 1309,
 1331,
 1336,
 1338,
 1355,
 1374,
 1399,
 1405,
 1406,
 1407,
 1418,
 1434,
 1441,
 1453,
 1473,
 1498,
 1501,
 1504,
 1509,
 1567,
 1640,
 1670,
 1686,
 1688,
 1689,
 1690,
 1692,
 1725,
 1759,
 1769,


In [147]:
vectorizer.vocabulary_.keys()

dict_keys([20, 37, 76, 85, 93, 109, 124, 130, 140, 141, 143, 149, 150, 164, 173, 177, 182, 185, 201, 202, 215, 220, 238, 243, 262, 275, 295, 297, 299, 301, 343, 353, 355, 363, 383, 392, 396, 397, 407, 411, 436, 443, 450, 454, 464, 478, 486, 493, 494, 495, 496, 500, 511, 512, 515, 518, 533, 534, 540, 541, 564, 574, 606, 608, 609, 618, 639, 684, 701, 721, 728, 729, 736, 748, 761, 767, 779, 788, 816, 821, 843, 861, 864, 873, 892, 913, 919, 922, 924, 929, 962, 967, 968, 969, 1004, 1005, 1007, 1011, 1062, 1063, 1064, 1077, 1082, 1092, 1101, 1148, 1152, 1158, 1174, 1192, 1198, 1200, 1206, 1211, 1219, 1220, 1221, 1222, 1226, 1228, 1246, 1248, 1249, 1258, 1271, 1292, 1301, 1309, 1331, 1336, 1338, 1355, 1374, 1399, 1405, 1406, 1407, 1418, 1434, 1441, 1453, 1473, 1498, 1501, 1504, 1509, 1567, 1640, 1670, 1686, 1688, 1689, 1690, 1692, 1725, 1759, 1769, 1814, 1815, 1816, 1858, 1861, 1862, 1863, 1867, 1871, 1899, 1900, 1901, 1903, 1918, 1929, 1953, 1955, 1958, 1962, 1970, 1971, 1985, 1991, 2006, 20

In [50]:
X_train, X_test, y_train, y_test = train_test_split(bow, labels, test_size=0.33)

In [51]:
# check out the dataset 
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(11489, 1180)
(5660, 1180)
(11489,)
(5660,)


### Using Random Forest classifier 

In [52]:
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.model_selection import RandomizedSearchCV
from scipy import stats

In [53]:
classifier = rfc()
classifier.fit(X_train,y_train)
classifier.score(X_test,y_test)

0.8740282685512367

In [54]:
classifier = rfc()

hyperparameters = {
    'n_estimators':stats.randint(10,300),
    'criterion':['gini','entropy'],
    'min_samples_split':stats.randint(2,9),
    'bootstrap':[True,False]
}

random_search = RandomizedSearchCV(classifier, hyperparameters, n_iter=65, n_jobs=4)

random_search.fit(bow, labels)

"classifier = rfc()\n\nhyperparameters = {\n    'n_estimators':stats.randint(10,300),\n    'criterion':['gini','entropy'],\n    'min_samples_split':stats.randint(2,9),\n    'bootstrap':[True,False]\n}\n\nrandom_search = RandomizedSearchCV(classifier, hyperparameters, n_iter=65, n_jobs=4)\n\nrandom_search.fit(bow, labels)"

In [None]:
optimized_classifier = random_search.best_estimator_
optimized_classifier.fit(X_train,y_train)

optimized_classifier.score(X_test,y_test)

In [None]:
print("train shape: " + str(X_train.shape))
print("score on test: " + str(optimized_classifier.score(X_test, y_test)))
print("score on train: "+ str(optimized_classifier.score(X_train, y_train)))

#### Taking a look at classification errors

In [55]:
optimized_classifier.fit(X_train,y_train)

correctly_classified = {}
incorrectly_classified = {}

for index, row in enumerate(X_test):
    probability = optimized_classifier.predict_proba(row)

    # get the location of the review in the dataframe
    review_loc = y_test.index[index]

    if optimized_classifier.predict(row) == y_test.iloc[index]:
        correctly_classified[parkReviews['review_text'].loc[review_loc]] = probability
    else:
        incorrectly_classified[parkReviews['review_text'].iloc[review_loc]] = probability

"optimized_classifier.fit(X_train,y_train)\n\ncorrectly_classified = {}\nincorrectly_classified = {}\n\nfor index, row in enumerate(X_test):\n    probability = optimized_classifier.predict_proba(row)\n\n    # get the location of the review in the dataframe\n    review_loc = y_test.index[index]\n\n    if optimized_classifier.predict(row) == y_test.iloc[index]:\n        correctly_classified[parkReviews['review_text'].loc[review_loc]] = probability\n    else:\n        incorrectly_classified[parkReviews['review_text'].iloc[review_loc]] = probability"

In [None]:
for review, score in incorrectly_classified.items():
    print('{}: {}'.format(review, score[0]))
    print('-----')

In [None]:
for review, score in correctly_classified.items():
    print('{}: {}'.format(review, score[0]))
    print('-----')

In [None]:
 incorrectly_classified

## Decision Tree

In [56]:
%%time

from sklearn.tree import DecisionTreeClassifier

clfdt = DecisionTreeClassifier(min_samples_split=30,max_depth=10)
clfdt.fit(X_train, y_train)

print("train shape: " + str(X_train.shape))
print("score on test: "  + str(clfdt.score(X_test, y_test)))
print("score on train: " + str(clfdt.score(X_train, y_train)))

train shape: (11489, 1180)
score on test: 0.8643109540636043
score on train: 0.8885890852119419
CPU times: user 200 ms, sys: 13.1 ms, total: 213 ms
Wall time: 228 ms


In [57]:
%%time

from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bg=BaggingClassifier(DecisionTreeClassifier(min_samples_split=10,max_depth=3),max_samples=0.5,max_features=1.0,n_estimators=10)
bg.fit(X_train, y_train)

print("train shape: " + str(X_train.shape))
print("score on test: " + str(bg.score(X_test, y_test)))
print("score on train: "+ str(bg.score(X_train, y_train)))

train shape: (11489, 1180)
score on test: 0.8591872791519435
score on train: 0.8603881974062146
CPU times: user 463 ms, sys: 14.3 ms, total: 477 ms
Wall time: 507 ms


In [58]:
# boosting decision tree
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# setting 
# min_samples_split=10
# max_depth=4

adb = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2),n_estimators=100,learning_rate=0.5)
adb.fit(X_train, y_train)

print("train shape: " + str(X_train.shape))
print("score on test: " + str(adb.score(X_test, y_test)))
print("score on train: "+ str(adb.score(X_train, y_train)))

train shape: (11489, 1180)
score on test: 0.8743816254416961
score on train: 0.8938114718426321


In [59]:
X_train[5]

<1x1180 sparse matrix of type '<class 'numpy.float64'>'
	with 4 stored elements in Compressed Sparse Row format>

## Naive Bayes

Sklearn Documentation:

- Naive Bayes: https://scikit-learn.org/stable/modules/naive_bayes.html
- MultinomialNB: https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html#sklearn.naive_bayes.MultinomialNB

In [60]:
from sklearn.naive_bayes import MultinomialNB

In [61]:
%%time
mnb = MultinomialNB().fit(X_train, y_train)

CPU times: user 6.85 ms, sys: 11.9 ms, total: 18.7 ms
Wall time: 36.9 ms


In [62]:
print("train shape: " + str(X_train.shape))
print("score on test: " + str(mnb.score(X_test, y_test)))
print("score on train: "+ str(mnb.score(X_train, y_train)))

train shape: (11489, 1180)
score on test: 0.873321554770318
score on train: 0.8766646357385325


## Logistic Regression 

Sklearn Documentation:

- LogisticRegression: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
- SGD Classifier: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html#sklearn.linear_model.SGDClassifier

In [63]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier

In [64]:
%%time

lr=LogisticRegression(max_iter=5000)
lr.fit(X_train, y_train)

CPU times: user 513 ms, sys: 69.2 ms, total: 582 ms
Wall time: 757 ms


LogisticRegression(max_iter=5000)

In [65]:
print("train shape: " + str(X_train.shape))
print("score on test: " + str(lr.score(X_test, y_test)))
print("score on train: "+ str(lr.score(X_train, y_train)))

train shape: (11489, 1180)
score on test: 0.8763250883392226
score on train: 0.881016624597441


In [66]:
%%time

#logistic regression with stochastic gradient decent
sgd=SGDClassifier()
sgd.fit(X_train, y_train)

CPU times: user 25.7 ms, sys: 7.11 ms, total: 32.9 ms
Wall time: 43.3 ms


SGDClassifier()

In [67]:
print("train shape: " + str(X_train.shape))
print("score on test: " + str(sgd.score(X_test, y_test)))
print("score on train: "+ str(sgd.score(X_train, y_train)))

train shape: (11489, 1180)
score on test: 0.876678445229682
score on train: 0.8839759770214988


## K-nearest neighbors

In [68]:
%%time

from sklearn.neighbors import KNeighborsClassifier

#knn = KNeighborsClassifier(n_neighbors=5,algorithm = 'ball_tree')
knn = KNeighborsClassifier(algorithm = 'brute', n_jobs=-1)

knn.fit(X_train, y_train)

print("train shape: " + str(X_train.shape))
print("score on test: " + str(knn.score(X_test, y_test)))
print("score on train: "+ str(knn.score(X_train, y_train)))

train shape: (11489, 1180)
score on test: 0.8454063604240283
score on train: 0.876403516406998
CPU times: user 21.9 s, sys: 12.5 s, total: 34.3 s
Wall time: 32.9 s


## Neural network pre-programmed

In [89]:
import keras

In [91]:
from keras import layers
from keras import models
from keras import optimizers
from keras import losses
from keras import metrics


In [None]:
# split an additional validation dataset
X_validation=X_train[:100]
X_partial_train=X_train[100:]
y_validation=y_train[:100]
y_partial_train=y_train[100:]
model=models.Sequential()
model.add(layers.Dense(16,activation='relu',input_shape=(30,)))
model.add(layers.Dense(16,activation='relu'))
model.add(layers.Dense(1,activation='sigmoid'))
model.compile(optimizer='rmsprop',loss='binary_crossentropy',metrics=['accuracy'])
model.fit(X_partial_train,y_partial_train,epochs=4,batch_size=512,validation_data=(X_validation,y_validation))

In [None]:
print('')
print("train shape: " + str(x_train.shape))
print("score on test: " + str(model.evaluate(x_test,y_test)[1]))
print("score on train: "+ str(model.evaluate(x_train,y_train)[1]))

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

#allReviewEn[['']]

train, test = train_test_split(allReviewEn, test_size = 0.3, random_state=42)

# clean the indexing
train.reset_index(drop=True),test.reset_index(drop=True)

# save train and test in csv files 
train[['review_text', 'label']].to_csv('all_en_train.csv', index=False)
test[['review_text', 'label']].to_csv('all_en_test.csv', index=False)

### Using Torchtest to processs text data

import numpy as np 

import torch 
import torchtext

from torchtext.legacy.data import Field, BucketIterator, TabularDataset, LabelField

import nltk 
nltk.download('punkt') # for punkt tokenizer

from nltk import word_tokenize 

# torchtext field parameter specifies how data should be processed, here tokenized
TEXT = Field(tokenize = word_tokenize)

LABEL = LabelField(dtype = torch.float) # convert 

datafields = [ ('review_text', TEXT), ('label', LABEL)] 

# specify what data that will work with, split to train and text, map to field 
trn, tst = TabularDataset.splits(path = '/Users/andreamock/Documents/review_datasets',
                               train = 'all_en_train.csv', test = 'all_en_test.csv', format = 'csv',
                               skip_header = True, fields = datafields)


# training examples 
trn[:5]

print(f'Number of training examples: {len(trn)}')
print(f'Number of testing examples: {len(tst)}')

# each example has label and text
trn[5].__dict__.keys()

trn[1].review_text # text has been tokenized in individual words

trn[1].label

# limit size of feature vectors to 15000, use one-encoding to get the top 15000 words in vocab
TEXT.build_vocab(trn, max_size = 15000)

LABEL.build_vocab(trn)

print(f'Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}')
print(f'Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}')
# two additional tokens were added to vocab, one for unknown words and another for padding to make sentences equal lengths

print(TEXT.vocab.freqs.most_common(50)) 

print(TEXT.vocab.itos[:10]) # integer to string mapping 0 and 1 to unknown and padding

batch_size = 64 

# returns a batch of examples where each example is of similar length (thus minimizing padding for each example)
train_iterator, test_iterator = BucketIterator.splits(
    (trn,tst), batch_size = batch_size, sort_key = lambda x: len(x.review_text), sort_within_batch = False
)

## Designing an RNN for binary text classification 

import torch.nn as nn

class RNN(nn.Module):
    
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        # input_dim = input dimensions of words 
        # embedding_dim = dimension of word embeddings, dense word representation for training RNN
        # hidden_dim = dimension of hidden state of RNN
        # output_dim = output dimensions of RNN output
        
        super().__init__()
        #  convert one-hot encoded sentences to dense format using embeddings to represent each word
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        # input to rnn is current word's embedding and previous hidden state, one word per time instance (memory cell)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim)
        # fully connected layer to classify as positive or negative 
        self.fc = nn.Linear(hidden_dim, output_dim)
        
        self.dropout = nn.Dropout(0.3)
        
    def forward(self, text):
        # input sentence (list of indexes of one hot encoded words) is represented using its embedding
        embedded = self.embedding(text)
        
        embedded_dropout = self.dropout(embedded)
        
        # output = concatentation of hidden state for every time step (ie word) [sentence length, batch size, hiddendim]
        # hidden = final hidden state fed into linear layer
        output, (hidden, _) = self.rnn(embedded_dropout)
        
        hidden_1D = hidden.squeeze(0) # get rid of unnecessary dimension 
        
        assert torch.equal(output[-1, :, :], hidden_1D) # confirm that it is indeed last hidden state 
        
        return self.fc(hidden_1D) # last hidden state fed into fully connected layer

# setting dimensions 
input_dim = len(TEXT.vocab)
embedding_dim = 100
hidden_dim = 256
output_dim=1

model = RNN(input_dim, embedding_dim, hidden_dim, output_dim)

model # see what our model looks like

# train with optimizer
import torch.optim as optim 

optimizer = optim.Adam(model.parameters(), lr=1e-6)

# binary cross entropy with logits (cross-entropy for binary classification, 
# w/ sigmoid activation func to predict in range of 0 and 1)
criterion = nn.BCEWithLogitsLoss()

def train(model, iterator, optimizer, criterion): # helper function for training process
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:  # iterator over all batches of training data
        
        optimizer.zero_grad() # zero out gradients of optimizer
                
        predictions = model(batch.review_text).squeeze(1) # make predictions, squeeze to be 1d instead of [, ]
        
        loss = criterion(predictions, batch.label) # calculate loss
        
        rounded_preds = torch.round(torch.sigmoid(predictions))
        correct = (rounded_preds == batch.label).float() # how many were correct
        
        acc = correct.sum() / len(correct)
        
        loss.backward() # backward pass on rnn
        
        optimizer.step()
        
        epoch_loss += loss.item() # keep track of epoch loss and accuracy
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

num_epochs = 5

for epoch in range(num_epochs):
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    print(f' Epoch: {epoch+1}, Train loss: {train_loss}, Train Acc: {train_acc*100:.2f}%')

Now we can test the accuracy on our test data.

# don't want to update the parameters when evaluating the accuracy
epoch_loss = 0
epoch_acc = 0

model.eval()

with torch.no_grad():

    for batch in test_iterator:

        predictions = model(batch.review_text).squeeze(1)

        loss = criterion(predictions, batch.label)

        rounded_preds = torch.round(torch.sigmoid(predictions))
        
        correct = (rounded_preds == batch.label).float() 
        acc = correct.sum() / len(correct)

        epoch_loss += loss.item()
        epoch_acc += acc.item()

test_loss = epoch_loss / len(test_iterator)
test_acc  = epoch_acc / len(test_iterator)

print(f'| Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% |')