In [106]:
# Importing required libraries
import pandas as pd
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
filepath = 'C://Users//fk2969//Desktop//twitter_sentiment_corpus.csv'

In [107]:
# Reading the dataset with Pandas
dataset = pd.read_csv(filepath)

# Extracting irrelevent sentiments
dataset = dataset[dataset.Sentiment != 'irrelevant']

# Splitting the data into tweets and sentiments
tweets = dataset["TweetText"]
sentiments = dataset["Sentiment"]

In [108]:
# Handling URLs on tweets
def handle_url(tweet):
    tweet = re.sub('(http:\/\/.*[\r\n]*)', '__URL', tweet)
    return tweet

In [109]:
# Handling usernames on tweets
def handle_username(tweet):
    tweet = re.sub('@[^\s]+','__USERNAME',tweet)
    return tweet

In [110]:
# Handling hashtags on tweets
def handle_hashtag(tweet):
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    return tweet

In [111]:
# Making all words on tweets proper
def handle_words(tweet):
    tweet = re.compile(r"(.)\1{1,}", re.IGNORECASE).sub(r"\1\1", tweet)
    return tweet

In [112]:
# Handling emoticons on tweets
emoticons = \
    [
     ('i like it',[ ':-)', ':)', '(:', '(-:', \
                       ':-D', ':D', 'X-D', 'XD', 'xD', \
                       ':-P', ':P', 'X-P', 'XP', 'xP', \
                       '<3', ':\*', ';-)', ';)', ';-D', ';D', '(;', '(-;', ] ),\
     ('i do not like it', [':-(', ':(', '):', ')-:', ':,(',\
                       ':\'(', ':"(', ':((', 
                        ':@', ':-@'] ),\
    ]
    
def replace_parenth(arr):
        return [text.replace(')', '[)}\]]').replace('(', '[({\[]') for text in arr]
    
def regex_join(arr):
        return '(' + '|'.join( arr ) + ')'
    
def handle_emoticons(tweet):
    emoticons_regex = [(repl, re.compile(regex_join(replace_parenth(regx))) ) \
            for (repl, regx) in emoticons]
    
    for (repl, regx) in emoticons_regex :
        tweet = re.sub(regx, ' '+repl+' ', tweet)
    return tweet

In [113]:
# Stemming a tweet using "nltk" library
def tweet_stemmer(tweet):
    tweet_stem = ' '
    stemmer = nltk.stem.PorterStemmer()
    words = [word if(word[0:2]=='__') else word.lower() for word in tweet.split() if len(word) >= 2]
    words = [stemmer.stem(word) for word in words] 
    tweet_stem = ' '.join(words)
    return tweet_stem

In [114]:
# Preprocessing the dataset
def preprocess_data(tweets, sentiments):
    X = []; y= []
    for tweet, sentiment in zip(tweets, sentiments):
        tweet = tweet.strip('\'"')
        tweet = handle_url(tweet)
        tweet = handle_username(tweet)
        tweet = handle_hashtag(tweet)
        tweet = handle_words(tweet)
        tweet = handle_emoticons(tweet)
        tweet = tweet.lower()
        tweet_stem = tweet_stemmer(tweet)

        X.append(tweet_stem)
        y.append(sentiment)

    return X, y

In [115]:
# Creating the Random Forest Classifier model
# svm_clf = SVC(C=0.1, kernel='rbf') # Accuracy: 0.456 +/- 0.001
# svm_clf = SVC(C=1, kernel='linear') # Accuracy: 0.698 +/- 0.072
# nb_clf = BernoulliNB() # Accuracy: 0.624 +/- 0.195
# nb_clf = MultinomialNB() # Accuracy: 0.702 +/- 0.053
rf_clf = RandomForestClassifier(n_estimators=200, criterion='gini', \
                                max_depth=None, min_samples_split=4, \
                                min_samples_leaf=1, min_weight_fraction_leaf=0.0, \
                                max_features='auto', max_leaf_nodes=None, \
                                bootstrap=True, oob_score=True, n_jobs=-1, \
                                random_state=43, verbose=1, \
                                warm_start=True, class_weight=None)

In [116]:
# Creating vectorizer in pipeline 
X, y = preprocess_data(tweets, sentiments)
vec = TfidfVectorizer(min_df=10, max_df=0.95, sublinear_tf=True, use_idf=True, ngram_range=(1, 2))
vec_clf = Pipeline([('vectorizer', vec), ('pac', rf_clf)])
vec_clf.fit(X,y)

[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:    0.6s finished


Pipeline(steps=[('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.95, max_features=None, min_df=10,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=...stimators=200, n_jobs=-1, oob_score=True, random_state=43,
            verbose=1, warm_start=True))])

In [117]:
print("Out-of-baggage Score: %0.3f" % rf_clf.oob_score_)

Out-of-baggage Score: 0.761


In [118]:
# Validating the model with K-Fold Cross Validation method
scores = cross_val_score(vec_clf, X, y, cv=10, n_jobs=-1, verbose=1)
print("10-Fold Cross Validation Scores: \n"+ str(scores))

[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:   10.0s remaining:    6.6s


10-Fold Cross Validation Scores: 
[ 0.6744186   0.67151163  0.75510204  0.76608187  0.6871345   0.73099415
  0.69005848  0.70175439  0.70175439  0.68035191]


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   11.2s finished


In [119]:
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.706 (+/- 0.064)
