### Dataset 
1. target: 0 = negative; 2 = neutral; 4 = positive;
2. ids
3. date
4. flag : query, if there is no query then value is NO_QUERY.
5. user
6. text

# 1. Importing libraries

In [1]:
import re
import string

import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings 
warnings.filterwarnings("ignore")

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

## English Stopwords

In [3]:
import nltk 
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/freakash/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/freakash/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/freakash/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
data = pd.read_csv('./input/training.1600000.processed.noemoticon.csv', encoding = 'latin-1',header = None)
data.columns = ['target','id','date','flag','user','text']
data.head(10)
# data.tail(10)

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
5,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew
6,0,1467811592,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,mybirch,Need a hug
7,0,1467811594,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,coZZ,@LOLTrish hey long time no see! Yes.. Rains a...
8,0,1467811795,Mon Apr 06 22:20:05 PDT 2009,NO_QUERY,2Hood4Hollywood,@Tatiana_K nope they didn't have it
9,0,1467812025,Mon Apr 06 22:20:09 PDT 2009,NO_QUERY,mimismo,@twittera que me muera ?


## Preprocessing the tweet text

In [7]:
data.shape

(1600000, 6)

In [8]:
def preprocess_tweet_data(tweet):
    tweet = tweet.lower()
    
    tweet = re.sub(r'http\S+|www\S+|https\S+',"", tweet, flags = re.MULTILINE)
    
    tweet = tweet.translate(str.maketrans("","",string.punctuation))
    
    tweet = re.sub(r'\@w+|\#',"",tweet)
    
    tweet_tokens = word_tokenize(tweet)
    
    filtered_tweet_words = [word for word in tweet_tokens if word not in stop_words]
    
    ps = PorterStemmer()
    stemmed_words = [ps.stem(w) for w in filtered_tweet_words]
    
    lemmatizer = WordNetLemmatizer()
    lemma_words = [lemmatizer.lemmatize(w, pos = 'a') for w in stemmed_words]
    
    return " ".join(lemma_words)

preprocess_tweet_data("Hi there, how are you preparing for your exams")

'hi prepar exam'

In [9]:
str.maketrans("","",string.punctuation)

{33: None,
 34: None,
 35: None,
 36: None,
 37: None,
 38: None,
 39: None,
 40: None,
 41: None,
 42: None,
 43: None,
 44: None,
 45: None,
 46: None,
 47: None,
 58: None,
 59: None,
 60: None,
 61: None,
 62: None,
 63: None,
 64: None,
 91: None,
 92: None,
 93: None,
 94: None,
 95: None,
 96: None,
 123: None,
 124: None,
 125: None,
 126: None}

In [10]:
data['text'] = data['text'].map(lambda a : preprocess_tweet_data(a))

In [11]:
data.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,switchfoot that bummer shoulda got david carr ...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,upset cant updat facebook text might cri resul...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,kenichan dive mani time ball manag save 50 res...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,whole bodi feel itchi like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,nationwideclass behav im mad cant see


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
x = data['text']
y = data['target']
x_train,x_test,y_train,y_test = train_test_split(x, y, test_size = 0.2, shuffle = True)
print(x_train.shape,x_test.shape,y_train.shape, y_test.shape)

(1280000,) (320000,) (1280000,) (320000,)


In [13]:
tv = TfidfVectorizer(ngram_range =(1,2), max_features = 10000, stop_words = 'english', sublinear_tf=True)
tv.fit(x_train)

TfidfVectorizer(max_features=10000, ngram_range=(1, 2), stop_words='english',
                sublinear_tf=True)

In [14]:
x_train = tv.transform(x_train)
x_test = tv.transform(x_test)

In [15]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
def evaluate_model(model):
    y_pred = model.predict(x_test)
    print(f"Accuracy Score {accuracy_score(y_test,y_pred)}")
    print(confusion_matrix(y_test,y_pred))
    print(classification_report(y_test,y_pred))

In [16]:
def estimate_result(model, tweet_text):
    tw_text = preprocess_tweet_data(tweet_text)
    return model.predict(tv.transform([tw_text]))

# Training models
## 1. Logistic Regression

In [17]:
log_model = LogisticRegression(C = 2, max_iter = 1000, n_jobs = -1)
log_model.fit(x_train, y_train)

LogisticRegression(C=2, max_iter=1000, n_jobs=-1)

In [18]:
y_pred = log_model.predict(x_test)
print(y_pred)

[0 4 0 ... 0 0 4]


In [19]:
evaluate_model(log_model)

Accuracy Score 0.772959375
[[119487  40525]
 [ 32128 127860]]
              precision    recall  f1-score   support

           0       0.79      0.75      0.77    160012
           4       0.76      0.80      0.78    159988

    accuracy                           0.77    320000
   macro avg       0.77      0.77      0.77    320000
weighted avg       0.77      0.77      0.77    320000



In [20]:
estimate_result(log_model,"life is hard man , life is hard")

array([0])

## 2. Naive Bayes Classifier

In [21]:

nb = MultinomialNB(100, fit_prior = True)
nb.fit(x_train,y_train)

 

MultinomialNB(alpha=100)

In [22]:
y_pred = nb.predict(x_test)
y_pred

array([0, 4, 0, ..., 0, 0, 4])

In [23]:
evaluate_model(nb)

Accuracy Score 0.758815625
[[120879  39133]
 [ 38046 121942]]
              precision    recall  f1-score   support

           0       0.76      0.76      0.76    160012
           4       0.76      0.76      0.76    159988

    accuracy                           0.76    320000
   macro avg       0.76      0.76      0.76    320000
weighted avg       0.76      0.76      0.76    320000



In [24]:
estimate_result(nb,"special is guest")

array([4])

# 3. SVM

In [None]:
svm_model = SVC(kernel = 'linear')
svm_model.fit(x_train,y_train)

In [None]:
y_pred = svm_model.predict(x_test)
print(y_pred)
evaluate_model(svm_model)

In [None]:

# GridSearchCV helps to loop through predefined hyperparameters and fit your estimator (model) on your training set. So, in the end, you can select the best parameters from the listed hyperparameters
from sklearn.model_selection import GridSearchCV

# for naive bayes
gcv=GridSearchCV(nb,{'alpha':[1.5,2,3,4,10,100,1.0,0.1,0.001,0.0001],'fit_prior':[True,False]})
gcv.fit(x_train,y_train)
print(gcv.best_score_,gcv.best_params_)

# for logistic regression
gcv = GridSearchCV(log_model,{'C' : [1,2,3,], 'max_iter' : [1000, 10000,5000], 'n_jobs' : [-1]})
gcv.fit(x_train,y_train)
print(gcv.best_score_,gcv.best_params_)

In [None]:
data[data.target == 0]

## Saving models

In [29]:


import pickle
file=open('vetorizer.pickle','wb')
pickle.dump(tv,file)
file.close()

file=open('log_model.pickle','wb')
pickle.dump(log_model,file)
file.close()

file = open('nb.pickle','wb')
pickle.dump(nb,file)
file.close()


## loading models

In [30]:
file=open('./vetorizer.pickle','rb')
vectorizer = pickle.load(file)
file.close()

file=open('./log_model.pickle','rb')
log_model = pickle.load(file)
file.close()

file = open('./nb.pickle','rb')
nb = pickle.load(file)
file.close()