0 = negative
4 = positive

In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
#dataset 
dataset_columns = ['target', 'id', 'date', 'flag', 'user', 'text']
dataset_encoding = 'ISO-8859-1'

dataset_path = '/content/drive/MyDrive/datasets/Twitter Sentiment140 Dataset/training.1600000.processed.noemoticon.csv'
df = pd.read_csv(dataset_path, encoding = dataset_encoding, names = dataset_columns)

In [None]:
df = df.drop(['id', 'date', 'flag', 'user'], axis=1)
df['target'] = df['target'].replace(4,1)
df.head()

Unnamed: 0,target,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [None]:
import nltk 
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [None]:
#CLEANING AND REMOVING STOPWORDS 

def cleaning_stopwords(text):
  return " ".join([word for word in str(text).split() if word not in stop_words])

df['text'] = df.text.apply(lambda x: cleaning_stopwords(x))

In [None]:
#Cleaning Text (RT, Punctuation etc)
#Creating new dataframe and new features
import re 

df['text1'] = df.text

re_val = "(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"

def cleaning(data):
    return re.sub(re_val , ' ', data)

df['text'] = df.text.apply(lambda x: cleaning(x))
df['text'] = df.text.apply(lambda x: cleaning_stopwords(x))
df.head()

Unnamed: 0,target,text,text1
0,0,Awww bummer You shoulda got David Carr Third D...,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,upset update Facebook texting might cry result...,upset can't update Facebook texting it... migh...
2,0,I dived many times ball Managed save 50 The re...,@Kenichan I dived many times ball. Managed sav...
3,0,whole body feels itchy like fire,whole body feels itchy like fire
4,0,behaving mad I see,"@nationwideclass no, behaving all. i'm mad. he..."


In [None]:
#applying lemmatization
lm = WordNetLemmatizer()
def lemmatizer_on_text(data):
    text = [lm.lemmatize(word) for word in data]
    return data
df['text'] = df['text'].apply(lambda x: lemmatizer_on_text(x))
df['text'].head()

0    Awww bummer You shoulda got David Carr Third D...
1    upset update Facebook texting might cry result...
2    I dived many times ball Managed save 50 The re...
3                     whole body feels itchy like fire
4                                   behaving mad I see
Name: text, dtype: object

In [None]:
df = df.drop('text1', axis=1)

In [None]:
from sklearn.model_selection import train_test_split
X = df.text
y = df.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB

model = Pipeline([('tfidf',TfidfVectorizer(analyzer='word')),('bnb',BernoulliNB())])

model.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('bnb', BernoulliNB())])

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, f1_score, recall_score
y_predict = model.predict(X_test)


In [None]:
print("Confusion matrix:")
print(confusion_matrix(y_test,y_predict))
print("Classification report:")
print(classification_report(y_test,y_predict))
print("Accuracy Score:")
print(accuracy_score(y_test,y_predict))
print("Precision Score:")
print(recall_score(y_test,y_predict))
print("Recall Score:")
print(recall_score(y_test,y_predict))
print("F1 Score:")
print(f1_score(y_test,y_predict))

Confusion matrix:
[[184103  55258]
 [ 54308 186331]]
Classification report:
              precision    recall  f1-score   support

           0       0.77      0.77      0.77    239361
           1       0.77      0.77      0.77    240639

    accuracy                           0.77    480000
   macro avg       0.77      0.77      0.77    480000
weighted avg       0.77      0.77      0.77    480000

Accuracy Score:
0.7717375
Precision Score:
0.7743175461999094
Recall Score:
0.7743175461999094
F1 Score:
0.7727921232280167


In [None]:
print(model.predict(["hate you"]))

[0]
