Sentiment Analysis with Kaggle Dataset

In [1]:
import numpy as np
import pandas as pd
import re

In [2]:
filename = '/home/yog/Sentiment_analysis/train.csv'
dataset = pd.read_csv(filename, encoding = 'latin-1') #data encoding in latin-1

In [3]:
dataset.head()

Unnamed: 0,ItemID,Sentiment,SentimentText
0,1,0,is so sad for my APL frie...
1,2,0,I missed the New Moon trail...
2,3,1,omg its already 7:30 :O
3,4,0,.. Omgaga. Im sooo im gunna CRy. I'...
4,5,0,i think mi bf is cheating on me!!! ...


In [4]:
def preprocess_tweet(tweet):
    #convert the tweet to lower case
    tweet = tweet.lower()
    #convert all urls to sting "URL"
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL',tweet)
    #convert all @username to "AT_USER"
    tweet = re.sub('@[^\s]+','AT_USER', tweet)
    #correct all multiple white spaces to a single white space
    tweet = re.sub('[\s]+', ' ', tweet)
    #convert "#topic" to just "topic"
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    return tweet

In [5]:
def feature_extraction(data, method):
    #feature extractions: "tfidf" Vectorization
    if method == "tfidf":
        from sklearn.feature_extraction.text import TfidfVectorizer
        tfv=TfidfVectorizer(sublinear_tf=True, stop_words = "english")
        features=tfv.fit_transform(data)
    else:
        return "Incorrect inputs"
    return features

In [6]:
dataset['SentimentText'] = dataset['SentimentText'].apply(preprocess_tweet)

In [7]:
data = np.array(dataset.SentimentText)
label = np.array(dataset.Sentiment)

In [8]:
features = feature_extraction(data, method = "tfidf")

In [9]:
def train_classifier(features, label, classifier ): 
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(features, label, test_size = 0.15, random_state = 0)
    if classifier == "logistic_regression":
        from sklearn.linear_model import LogisticRegression
        model = LogisticRegression(C=1.)
    elif classifier == "naive_bayes":
        from sklearn.naive_bayes import MultinomialNB
        model = MultinomialNB()
    else:
        print('Invalid Selection')
    #fit model to data
    model.fit(features, label)
    #make prediction on the data
    y_pred = model.predict_proba(X_test)[:,1]
    from sklearn.metrics import accuracy_score
    predictions = [round(value) for value in y_pred]
    #cm = confusion_matrix(y_test, predictions)
    Accuracy_Score = accuracy_score(y_test, predictions)
    #print('Confusion Matrix: ',cm)
    print('Accuracy of Model:', Accuracy_Score)

In [10]:
train_classifier(features, label, "naive_bayes")

Accuracy of Model: 0.8217881192079471
