# Twitter Sentiment Analysis

<b>Created by Aruna on 31/5/19</b>

## I. Importing the necessary libraries

In [1]:
import pandas as pd
import numpy as np
import csv
import string
from nltk import corpus
from nltk import word_tokenize
from nltk import pos_tag
import nltk

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Aruna\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Aruna\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

## II. Text cleaning functions

### Step #1: Make list of tweets of string type

In [2]:
def makeListOfTweets(A):
    tw = []
    
    for item in A['tweet']:
        tw.append(str(item))
    
    return tw

### Step #2: Replace symbols like #, &amp; with whitespace and remove any hex characters or '@user'

In [3]:
def removeSymbols(A):
    for i in range(0, len(A)):
        A[i] = A[i].replace("@user", "")
        A[i] = A[i].replace("#", " ")
        A[i] = A[i].replace("&amp;", " ")
        A[i] = A[i].encode('ascii', errors='ignore').decode("utf-8")
        
    return A

### Step #3: Tokenize the tweets

In [4]:
def tokenizeTweets(A):
    tokens = []
    for t in A:
        tokens.append(word_tokenize(t))
    return tokens

## III. Load the training CSV data using pd

In [5]:
tweets = pd.read_csv("train.csv")
tweets.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [6]:
tw = makeListOfTweets(tweets)
tw = removeSymbols(tw)
# tokens = tokenizeTweets(tw)

tweets['tweet'] = tw

tweets

Unnamed: 0,id,label,tweet
0,1,0,when a father is dysfunctional and is so sel...
1,2,0,thanks for lyft credit i can't use cause th...
2,3,0,bihday your majesty
3,4,0,model i love u take with u all the time in ...
4,5,0,factsguide: society now motivation
5,6,0,[2/2] huge fan fare and big talking before the...
6,7,0,camping tomorrow danny
7,8,0,the next school year is the year for exams. ca...
8,9,0,we won!!! love the land!!! allin cavs champ...
9,10,0,welcome here ! i'm it's so gr8 !


## IV. Loading the test CSV data using pd 

In [7]:
test = pd.read_csv("test_tweets.csv")
test.head()

Unnamed: 0,id,tweet
0,31963,#studiolife #aislife #requires #passion #dedic...
1,31964,@user #white #supremacists want everyone to s...
2,31965,safe ways to heal your #acne!! #altwaystohe...
3,31966,is the hp and the cursed child book up for res...
4,31967,"3rd #bihday to my amazing, hilarious #nephew..."


In [8]:
tt = makeListOfTweets(test)
tt = removeSymbols(tt)
# testtokens = tokenizeTweets(tt)

test['tweet'] = tt

test

Unnamed: 0,id,tweet
0,31963,studiolife aislife requires passion dedic...
1,31964,white supremacists want everyone to see th...
2,31965,safe ways to heal your acne!! altwaystohe...
3,31966,is the hp and the cursed child book up for res...
4,31967,"3rd bihday to my amazing, hilarious nephew..."
5,31968,choose to be :) momtips
6,31969,something inside me dies eyes ness smokeyey...
7,31970,finished tattoo inked ink loveit thanks al...
8,31971,i will never understand why my dad left me...
9,31972,delicious food lovelife capetown mannaep...


## V. Now to build the model

In [9]:
X_train = tweets['tweet']
X_test = test['tweet']
y_train = tweets['label']
y_test = []

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

X_train_tfidf = vectorizer.fit_transform(X_train) # remember to use the original X_train set
X_train_tfidf.shape

(31962, 39257)

In [None]:
from sklearn.svm import LinearSVC
clf = LinearSVC()
clf.fit(X_train_tfidf, y_train)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

text_clf = Pipeline([('tfidf', TfidfVectorizer()),
                     ('rfc',LinearSVC()),])

text_clf.fit(X_train, y_train)  

In [None]:
predictions = text_clf.predict(X_test)

predictions

In [None]:
test['label'] = predictions

In [None]:
test['label']

In [None]:
test['label'].value_counts()

In [None]:
test['id'][5]

## VI. Finally, store the model in a new CSV file

In [None]:
with open('test_predictions.csv', 'w', encoding='utf-8', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    writer.writerow(['id', 'label'])
    
    for i in range(0, len(test['label'])):
        writer.writerow([test['id'][i], test['label'][i]])

In [11]:
import xgboost as xgb

xg = xgb.XGBClassifier(gamma=1,                 
    learning_rate=0.01,
    max_depth=3,
    n_estimators=10000,                                                                    
    subsample=0.8,
    random_state=34)

In [12]:
xg.fit(X_train_tfidf, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=1, learning_rate=0.01, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=10000,
       n_jobs=1, nthread=None, objective='binary:logistic',
       random_state=34, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=0.8)

In [None]:
from sklearn.pipeline import Pipeline

text_clf = Pipeline([('tfidf', TfidfVectorizer()),
                     ('rfc', XGBClassifier()),])

text_clf.fit(X_train, y_train)  

In [13]:
predictions = xg.predict(X_test)

TypeError: can not initialize DMatrix from Series