In [129]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from IPython.display import display, HTML

In [49]:
data = pd.read_csv('../Anti-National-Tweets-Classification/dataset/labelled/data_main.csv')
data = data[pd.notnull(data['tweet'])]
data_shuffled = pd.DataFrame()

data = data.sample(data.shape[0])
data.shape

(10529, 5)

In [74]:
X = data['tweet']
Y = data['label']
x_train,  x_test,y_train, y_test = train_test_split(X, Y, train_size=0.8,test_size=0.2, random_state=True)

In [126]:
x_train.head()

1266    b want justice solution khalistan cant tollara...
6512          road cant wait see everyone bootcamp nustar
9444    happy work conference right mindset lead cultu...
4993    great terry fox pathetic annoy khalistan refer...
7733    enjoy moment everydaybeautiful zen mindfulness...
Name: tweet, dtype: object

In [127]:
y_train.value_counts()

1.0    4228
0.0    4195
Name: label, dtype: int64

In [128]:
y_test.value_counts()

0.0    1089
1.0    1017
Name: label, dtype: int64

In [78]:
count_vect = CountVectorizer()
x_train_counts = count_vect.fit_transform(x_train)
x_train_counts.shape

(8423, 10036)

In [79]:
tfidf_transformer = TfidfTransformer()
x_train_tfidf = tfidf_transformer.fit_transform(x_train_counts)
x_train_tfidf.shape

(8423, 10036)

In [130]:
naive_bayes = MultinomialNB().fit(x_train_tfidf, y_train)
def Naive_Bayes_Classifier(x_test):
    x_test_counts = count_vect.transform(x_test)
    x_test_tfidf = tfidf_transformer.fit_transform(x_test_counts)
    
    predictions = naive_bayes.predict(x_test_tfidf)
    
    return predictions

predictions = Naive_Bayes_Classifier(x_test)
acc = np.mean(predictions == y_test)
print(acc)

0.9791073124406457


In [131]:
logisticRegr = LogisticRegression()
logisticRegr.fit(x_train_tfidf, y_train)

def Logistic_Regression_Classifier(x_test):
    # all parameters not specified are set to their defaults
    x_test_counts = count_vect.transform(x_test)
    x_test_tfidf = tfidf_transformer.fit_transform(x_test_counts) 
    
    predictions = logisticRegr.predict(x_test_tfidf)
    
    return(predictions)

predictions = Logistic_Regression_Classifier(x_test)
acc = np.mean(predictions == y_test)
print(acc)

0.9886039886039886




In [132]:
decisionTree = DecisionTreeClassifier()

# Train Decision Tree Classifer
decisionTree = decisionTree.fit(x_train_tfidf, y_train)

def Decision_Tree_Classifier(x_test):
    x_test_counts = count_vect.transform(x_test)
    x_test_tfidf = tfidf_transformer.fit_transform(x_test_counts)
    
    predictions = decisionTree.predict(x_test_tfidf)
    
    return predictions

predictions = Decision_Tree_Classifier(x_test)
acc = np.mean(predictions == y_test)
print(acc)

0.9520417853751187


In [133]:
randomForest = RandomForestClassifier(n_estimators=100)

randomForest = randomForest.fit(x_train_tfidf, y_train)

def Random_Forest_Classifier(x_test):
    x_test_counts = count_vect.transform(x_test)
    x_test_tfidf = tfidf_transformer.fit_transform(x_test_counts)
    
    predictions = randomForest.predict(x_test_tfidf)
    
    return predictions

predictions = Random_Forest_Classifier(x_test)
acc = np.mean(predictions == y_test)
print(acc)

0.976258309591643


In [138]:
text = "I watch Pokemon all day long! HAHA"
prediction = Naive_Bayes_Classifier([text])

if(prediction == 1):
    print(text,'\nClassified as Positive')
else:
    print(text,'\nClassified as Negative')

I watch Pokemon all day long! HAHA 
Classified as Negative


In [139]:
import pickle
pickle.dump(logisticRegr, open('../Anti-National-Tweets-Classification/models/logistic_regression_model.pkl','wb')) 
pickle.dump(decisionTree, open('../Anti-National-Tweets-Classification/models/decision_tree_model.pkl','wb')) 
pickle.dump(randomForest, open('../Anti-National-Tweets-Classification/models/random_forest_model.pkl','wb'))