In [1]:
# Loading the data and importing pandas

import pandas as pd

file=open('twitter_new.csv')
df=pd.read_csv(file, names=['sentiment','id','date','query','user','tweet'])

#taking 1 lack random samples, due to computational constraints

df=df.sample(n=100000)
df

Unnamed: 0,sentiment,id,date,query,user,tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
...,...,...,...,...,...,...
1599995,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599996,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...


In [2]:
# dropping unnecessary columns

df=df.drop('id',axis=1)
df=df.drop('date',axis=1)
df=df.drop('query',axis=1)
df=df.drop('user',axis=1)
df.head()

Unnamed: 0,sentiment,tweet
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [3]:
# removing single letters, tags and links from the comment

def clean(x):
    x_list=x.split(' ')
    index_list=[]
    for i in x_list:
        if len(i)<2:
            index_list.append(x_list.index(i))
        elif i[0]=='@':
            index_list.append(x_list.index(i))
        elif len(i)>5:
            if i[0:4]=='http':
                index_list.append(x_list.index(i))
    index_list.sort(reverse=True)
    for i in index_list:
        x_list.pop(i)
    x= ' '.join(x_list)
    return(x)
    
df['tweet']=df['tweet'].apply(lambda x: clean(x))

df.head()


Unnamed: 0,sentiment,tweet
0,0,"Awww, that's bummer. You shoulda got David Car..."
1,0,is upset that he can't update his Facebook by ...
2,0,dived many times for the ball. Managed to save...
3,0,my whole body feels itchy and like its on fire
4,0,"no, it's not behaving at all. i'm mad. why am ..."


#### converting all text to lowercase

df['tweet']=df['tweet'].apply(lambda x:x.lower())

#### Removing numbers

import re
df['tweet']=df['tweet'].apply(lambda x:re.sub(r'\d+', '', x))

In [4]:
# removing punctuation

import string

translator = str.maketrans('', '', string.punctuation)
df['tweet']=df['tweet'].apply(lambda x: x.translate(translator))

#### Removing stopwords

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def remove_stopwords(text):
    stop_words = set(stopwords.words("english"))
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word not in stop_words]
    return ' '.join(filtered_text)

df['tweet']=df['tweet'].apply(lambda x:remove_stopwords(x))

In [5]:
# Using stemmer i.e. removing suffixes

tokenized_tweet = df['tweet'].apply(lambda x: x.split())
import nltk
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

tokenized_tweet = tokenized_tweet.apply(lambda sentence: [stemmer.stem(word) for word in sentence])


In [6]:
df['tweet']=tokenized_tweet.apply(lambda x: ' '.join(x))

In [7]:
# using vectorizer to transform text to numbers
# tried three vectorizers and kept the one which performed best

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
#vectorizer=CountVectorizer(ngram_range=(2,3))
vector=vectorizer.fit_transform(df['tweet'])
#bow_vectorizer = CountVectorizer(max_df=0.90, min_df=2,max_features=1000, stop_words='english')
#bow = bow_vectorizer.fit_transform(df['tweet'])

In [8]:
# train test split

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(vector, df['sentiment'], random_state=42, test_size=0.25)

In [9]:
# tried logistic regression

from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(max_iter=100)
classifier.fit(x_train,y_train)
print(classifier.score(x_train,y_train))
print(classifier.score(x_test,y_test))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.7985075
0.7884725


In [10]:
from sklearn.metrics import classification_report

y_pred = classifier.predict(x_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.80      0.77      0.78    199581
           4       0.78      0.81      0.79    200419

    accuracy                           0.79    400000
   macro avg       0.79      0.79      0.79    400000
weighted avg       0.79      0.79      0.79    400000



### Tried tree and KNN but logistic regression gave the most optimised results

from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 10)
classifier.fit(x_train,y_train)
print(classifier.score(x_train,y_train))
print(classifier.score(x_test,y_test))