In [1]:
# Loading the data and importing pandas

import pandas as pd

file=open('twitter_new.csv')
df=pd.read_csv(file, names=['sentiment','id','date','query','user','tweet'])

#taking 1 lack random samples, due to computational constraints

df=df.sample(n=100000)
df

Unnamed: 0,sentiment,id,date,query,user,tweet
1406580,4,2055371687,Sat Jun 06 09:11:40 PDT 2009,NO_QUERY,xdahlia,It's 11:11 make a wish.
934634,4,1792535162,Wed May 13 23:25:51 PDT 2009,NO_QUERY,miguelsolorio,learning more about expression engine makes me...
71524,0,1693987193,Mon May 04 00:34:58 PDT 2009,NO_QUERY,ricky_chotai,@paul_fernley I did felt guilty after though i...
1584287,4,2190539196,Tue Jun 16 03:01:11 PDT 2009,NO_QUERY,neonp1nk,really needs to get a move on. Running behind ...
657613,0,2241091545,Fri Jun 19 10:31:50 PDT 2009,NO_QUERY,jonk,"looks like pwnagetool for mac is out, but stil..."
...,...,...,...,...,...,...
1452562,4,2063152509,Sun Jun 07 01:33:41 PDT 2009,NO_QUERY,Twinnadryl,Just gettin home... Allure was stooopid! Wow d...
248183,0,1982650574,Sun May 31 11:37:55 PDT 2009,NO_QUERY,taii_vuitton,my stupid new husband just sold my benz i wan...
744171,0,2267057246,Sun Jun 21 09:33:15 PDT 2009,NO_QUERY,MiraIzzati,I have to go to bed now
528454,0,2195096540,Tue Jun 16 10:40:25 PDT 2009,NO_QUERY,LINOOO,@Sarah_1991 I'm jealous too Do you know if sh...


from spacy import displacy
import en_core_web_sm
nlp = en_core_web_sm.load()
doc = nlp("I went to Delhi and visited Qutub Minar.")
displacy.serve(doc, style="dep")


In [2]:
# dropping unnecessary columns

df=df.drop('id',axis=1)
df=df.drop('date',axis=1)
df=df.drop('query',axis=1)
df=df.drop('user',axis=1)
df.head()

Unnamed: 0,sentiment,tweet
1406580,4,It's 11:11 make a wish.
934634,4,learning more about expression engine makes me...
71524,0,@paul_fernley I did felt guilty after though i...
1584287,4,really needs to get a move on. Running behind ...
657613,0,"looks like pwnagetool for mac is out, but stil..."


In [3]:
# removing single letters, tags and links from the comment

def clean(x):
    x_list=x.split(' ')
    index_list=[]
    for i in x_list:
        if len(i)<2:
            index_list.append(x_list.index(i))
        elif i[0]=='@':
            index_list.append(x_list.index(i))
        elif len(i)>5:
            if i[0:4]=='http':
                index_list.append(x_list.index(i))
    index_list.sort(reverse=True)
    for i in index_list:
        x_list.pop(i)
    x= ' '.join(x_list)
    return(x)
    
df['tweet']=df['tweet'].apply(lambda x: clean(x))

df.head()


Unnamed: 0,sentiment,tweet
1406580,4,It's 11:11 make wish.
934634,4,learning more about expression engine makes me...
71524,0,did felt guilty after though really should be ...
1584287,4,really needs to get move on. Running behind my...
657613,0,"looks like pwnagetool for mac is out, but stil..."


In [4]:
# Using stemmer i.e. removing suffixes

tokenized_tweet = df['tweet'].apply(lambda x: x.split())
import nltk
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

tokenized_tweet = tokenized_tweet.apply(lambda sentence: [stemmer.stem(word) for word in sentence])


In [5]:
df['tweet']=tokenized_tweet.apply(lambda x: ' '.join(x))

In [6]:
# using vectorizer to transform text to numbers
# tried three vectorizers and kept the one which performed best

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
#vectorizer=CountVectorizer(ngram_range=(2,3))
vector=vectorizer.fit_transform(df['tweet'])
#bow_vectorizer = CountVectorizer(max_df=0.90, min_df=2,max_features=1000, stop_words='english')
#bow = bow_vectorizer.fit_transform(df['tweet'])

In [7]:
# train test split

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(vector, df['sentiment'], random_state=42, test_size=0.25)

In [8]:
# tried logistic regression

from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(max_iter=100)
classifier.fit(x_train,y_train)
print(classifier.score(x_train,y_train))
print(classifier.score(x_test,y_test))

0.82492
0.77032


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [9]:
from sklearn.metrics import classification_report

y_pred = classifier.predict(x_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.78      0.76      0.77     12594
           4       0.76      0.78      0.77     12406

    accuracy                           0.77     25000
   macro avg       0.77      0.77      0.77     25000
weighted avg       0.77      0.77      0.77     25000



### Tried tree and KNN but logistic regression gave the most optimised results

from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 10)
classifier.fit(x_train,y_train)
print(classifier.score(x_train,y_train))
print(classifier.score(x_test,y_test))