In [35]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB 
from nltk.stem.snowball import FrenchStemmer
from xgboost import XGBClassifier

In [36]:
Corpus= pd.read_csv('./nlp/tweets.csv', sep='^([^,]+),', engine='python', error_bad_lines=False, encoding='utf-8', index_col=[0])

In [37]:
Corpus.columns=['polarity','text']

In [38]:
Corpus = Corpus.sample(n=1000)
Corpus = Corpus.reset_index(drop=True)

In [39]:
Corpus['text']=Corpus['text'].astype(str)
Corpus['text'] = [entry.lower() for entry in Corpus['text']]
Corpus['text']= [word_tokenize(entry) for entry in Corpus['text']]
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV
for index,entry in enumerate(Corpus['text']):
    Final_words = []
    stemmer = FrenchStemmer()
    for word, tag in pos_tag(entry):
        if word not in stopwords.words('french') and word.isalpha():
            word_Final = stemmer.stem(word)
            Final_words.append(word_Final)
    Corpus.loc[index,'text_final'] = str(Final_words)

In [40]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(Corpus['text_final'],Corpus['polarity'],test_size=0.3)

In [41]:
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)

In [42]:
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(Corpus['text_final'])
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

In [43]:
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf,Train_Y)# predict the labels on validation dataset
predictions_NB = Naive.predict(Test_X_Tfidf)
print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, Test_Y)*100)

Naive Bayes Accuracy Score ->  68.33333333333333


In [44]:
XGboost = XGBClassifier()
XGboost.fit(Train_X_Tfidf,Train_Y)# predict the labels on validation dataset
predictions_xgb = XGboost.predict(Test_X_Tfidf)# Use accuracy_score function to get the accuracy
print("XGboost Accuracy Score -> ",accuracy_score(predictions_xgb, Test_Y)*100)

XGboost Accuracy Score ->  57.666666666666664


In [45]:
rfc = RandomForestClassifier()
rfc.fit(Train_X_Tfidf,Train_Y)# predict the labels on validation dataset
predictions_rfc = rfc.predict(Test_X_Tfidf)# Use accuracy_score function to get the accuracy
print("Random Forest Accuracy Score -> ",accuracy_score(predictions_rfc, Test_Y)*100)

Random Forest Accuracy Score ->  59.0




In [48]:
from time import time
from scipy.stats import randint as sp_randint

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier

# build a classifier
clf = RandomForestClassifier(n_estimators=20)

# Utility function to report best scores
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")


# specify parameters and distributions to sample from
param_dist = {"max_depth": [1, 3, 5, None],
              "max_features": sp_randint(1, 10),
              "min_samples_split": sp_randint(2, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# run randomized search
n_iter_search = 20
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search, cv=10, iid=False)

start = time()
random_search.fit(Train_X_Tfidf,Train_Y)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.cv_results_)

# use a full grid over all parameters
param_grid = {"max_depth": [1, 3, 5, None],
              "max_features": [2, 3, 9],
              "min_samples_split": [2, 3, 10],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# run grid search
grid_search = GridSearchCV(clf, param_grid=param_grid, cv=10, iid=False)
start = time()
grid_search.fit(Train_X_Tfidf,Train_Y)

print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(grid_search.cv_results_['params'])))
report(grid_search.cv_results_)

RandomizedSearchCV took 13.11 seconds for 20 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.611 (std: 0.032)
Parameters: {'bootstrap': True, 'criterion': 'entropy', 'max_depth': None, 'max_features': 9, 'min_samples_split': 4}

Model with rank: 2
Mean validation score: 0.607 (std: 0.041)
Parameters: {'bootstrap': False, 'criterion': 'entropy', 'max_depth': None, 'max_features': 9, 'min_samples_split': 5}

Model with rank: 3
Mean validation score: 0.606 (std: 0.043)
Parameters: {'bootstrap': False, 'criterion': 'entropy', 'max_depth': None, 'max_features': 6, 'min_samples_split': 10}

GridSearchCV took 91.46 seconds for 144 candidate parameter settings.
Model with rank: 1
Mean validation score: 0.623 (std: 0.052)
Parameters: {'bootstrap': False, 'criterion': 'entropy', 'max_depth': None, 'max_features': 9, 'min_samples_split': 3}

Model with rank: 2
Mean validation score: 0.610 (std: 0.043)
Parameters: {'bootstrap': False, 'criterion': 'gini', 'max_depth': No