In [2]:
#import all libraries
import os
import pandas as pd
import re
import json
from sklearn.metrics import accuracy_score,classification_report
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split

In [3]:
# Remember the notebook's own directory so a later cell can return to it, then
# move to the 'Labelling' folder two levels up, where the Excel file is stored.
# A single chdir on the joined path is all-or-nothing: if 'Labelling' does not
# exist, the kernel stays where it was instead of being left two levels up.
cur_dir = os.getcwd()
os.chdir(os.path.join(os.pardir, os.pardir, 'Labelling'))

In [4]:
# Pick the Excel workbook from the current working directory.
# Only names ending in '.xlsx' qualify (a bare substring test would also accept
# e.g. 'xlsx_notes.txt'), and Excel's '~$' lock files are skipped. Candidates
# are sorted so the choice does not depend on the arbitrary os.listdir() order.
files = os.listdir()
excel_file = next(
    (file for file in sorted(files)
     if file.lower().endswith('.xlsx') and not file.startswith('~$')),
    None,
)
if excel_file is None:
    # Fail here with a clear message rather than a NameError in a later cell.
    raise FileNotFoundError("No .xlsx file found in {}".format(os.getcwd()))

In [5]:
# Load the workbook found in the previous cell.
# The chdir back to the notebook's main directory sits in `finally` so the
# working directory is restored even when reading the file fails.
try:
    df = pd.read_excel(excel_file)
finally:
    os.chdir(cur_dir)

In [6]:
# Keep only the tweet text and its label; all other columns are unused.
df = df.loc[:, ['Tweets', 'Label']]

In [7]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,Tweets,Label
0,yearn taste history from wine glass natural...,D
1,georgia winemakers guardians wine oldest ...,D
2,psst have plans better brighter year c...,P
3,banish whatever from wish leave behind ...,V
4,with winter coming like lion raise many gl...,D


In [8]:
# Number of rows per label (count tallies the non-null values in each column).
df.groupby(by='Label').agg('count')

Unnamed: 0_level_0,Tweets
Label,Unnamed: 1_level_1
D,6580
P,9277
R,4667
S,9258
V,3234


In [21]:
# Total number of labelled tweets: sum of the per-label counts.
# Selecting the 'Tweets' column first makes .sum() return a scalar; calling
# int() on a one-element Series is deprecated in pandas and would fail outright
# if df held more than one non-key column.
print("Total Tweets: {}".format(int(df.groupby('Label')['Tweets'].count().sum())))

Total Tweets: 33016


In [22]:
data = df['Tweets']
labels = df['Label']

naive_bayes = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB())])

x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)

print("Model is being trained!!!")
print("Please Wait....")
%time naive_bayes.fit(x_train, y_train)
print("-----DONE-----")
print(classification_report(y_test, naive_bayes.predict(x_test), digits=4))
y_pred = naive_bayes.predict(x_test)
print("Accuracy:",accuracy_score(y_test, y_pred))

Model is being trained!!!
Please Wait....
Wall time: 919 ms
-----DONE-----
              precision    recall  f1-score   support

           D     0.8153    0.8966    0.8540      1305
           P     0.6831    0.9216    0.7847      1876
           R     0.9454    0.3994    0.5615       954
           S     0.7496    0.8571    0.7998      1813
           V     0.9321    0.2302    0.3692       656

   micro avg     0.7548    0.7548    0.7548      6604
   macro avg     0.8251    0.6610    0.6738      6604
weighted avg     0.7901    0.7548    0.7290      6604

Accuracy: 0.7548455481526347


In [None]:
data = df['Tweets']
labels = df['Label']

x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)

decision_tree = Pipeline([
            ('vect', CountVectorizer( ngram_range=(2,4))),
            ('tfidf', TfidfTransformer()),
            ('clf', DecisionTreeClassifier(criterion='gini', splitter='best')),
])


print("Model is being trained!!!")
print("Please Wait....")
%time decision_tree.fit(x_train, y_train)
print("-----DONE-----")
print(classification_report(y_test, decision_tree.predict(x_test), digits=4))
y_pred = decision_tree.predict(x_test)
print("Accuracy:",accuracy_score(y_test, y_pred))

Model is being trained!!!
Please Wait....


In [None]:
data = df['Tweets']
labels = df['Label']

x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)

random_forrest = Pipeline([
            ('vect', CountVectorizer( ngram_range=(2,4))),
            ('tfidf', TfidfTransformer()),
            ('clf', RandomForestClassifier(n_estimators=100,criterion='entropy'))
            ])

print("Model is being trained!!!")
print("Please Wait....")
%time random_forrest.fit(x_train, y_train)
print("-----DONE-----")
print(classification_report(y_test, random_forrest.predict(x_test), digits=4))
y_pred = random_forrest.predict(x_test)
print("Accuracy:",accuracy_score(y_test, y_pred))