In [1]:
import numpy as np 
import pandas as pd 
pd.set_option('display.max_colwidth', -1)
from time import time
import re
import string
import os
import emoji
from pprint import pprint
import collections
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="darkgrid")
sns.set(font_scale=1.3)
from sklearn.metrics import roc_auc_score ,mean_squared_error,accuracy_score,classification_report,confusion_matrix,roc_curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.svm import SVC  
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud, STOPWORDS , ImageColorGenerator
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.externals import joblib
import gensim
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import warnings
warnings.filterwarnings('ignore')
np.random.seed(37)

  from numpy.core.umath_tests import inner1d


In [2]:
def load_train(path='data/train.csv'):
    """Load the labelled training tweets.

    Parameters
    ----------
    path : str, optional
        Location of the training CSV. Defaults to ``'data/train.csv'``, the
        path this notebook has always read from.

    Returns
    -------
    pandas.DataFrame
        The ``tweet`` and ``sentiment`` columns only, with every row that has
        a missing value in either column dropped. The index is NOT reset, so
        it keeps the row labels of the original file.
    """
    df = pd.read_csv(path)
    df = df[['tweet', 'sentiment']]
    return df.dropna()

# df = load_train()
# df.head()

In [3]:
def load_test(path='data/test.csv'):
    """Load the unlabelled test tweets.

    Parameters
    ----------
    path : str, optional
        Location of the test CSV. Defaults to ``'data/test.csv'``, the path
        this notebook has always read from.

    Returns
    -------
    pandas.DataFrame
        The ``tweet_id`` and ``tweet`` columns only. No rows are dropped, so
        every tweet id in the file is kept.
    """
    df = pd.read_csv(path)
    return df[['tweet_id', 'tweet']]

# dftest = load_test()
# df1 = dftest[['tweet']]
# df1.head()

In [4]:
class TextCounts(BaseEstimator, TransformerMixin):
    """Transformer that turns a Series of raw tweets into count features.

    For every tweet it counts words, @-mentions, hashtags, all-capital words,
    exclamation/question marks, URLs and emojis, and returns those counts as
    one DataFrame column each.
    """

    def count_regex(self, pattern, tweet):
        """Return the number of non-overlapping matches of ``pattern`` in ``tweet``."""
        matches = re.findall(pattern, tweet)
        return len(matches)

    def fit(self, X, y=None, **fit_params):
        """Learn nothing from the data and return ``self``."""
        return self

    def transform(self, X, **transform_params):
        """Compute the count features for every element of ``X``.

        ``X`` must support ``.apply`` (the notebook passes a pandas Series).
        Each element is converted with ``str`` before matching. The returned
        DataFrame has the columns ``count_words``, ``count_mentions``,
        ``count_hashtags``, ``count_capital_words``,
        ``count_excl_quest_marks``, ``count_urls`` and ``count_emojis``, in
        that order.
        """
        def counter(pattern):
            # Build the per-tweet counting function for one regex.
            return lambda raw: self.count_regex(pattern, str(raw))

        features = {
            'count_words': X.apply(counter(r'\w+')),
            'count_mentions': X.apply(counter(r'@\w+')),
            'count_hashtags': X.apply(counter(r'#\w+')),
            'count_capital_words': X.apply(counter(r'\b[A-Z]{2,}\b')),
            'count_excl_quest_marks': X.apply(counter(r'!|\?')),
            'count_urls': X.apply(counter(r'http.?://[^\s]+[\s]?')),
        }

        # Emoji symbols are first replaced by their textual description so a
        # plain regex can count them.
        described = X.apply(lambda raw: emoji.demojize(str(raw)))
        features['count_emojis'] = described.apply(counter(r':[a-z_&]+:'))

        return pd.DataFrame(features)


In [5]:
class CleanText(BaseEstimator, TransformerMixin):
    """Transformer that normalises raw tweets into cleaned, stemmed text.

    ``transform`` applies, in order: mention removal, URL removal, underscore
    removal, punctuation removal, digit removal, lower-casing, stop-word
    removal and Porter stemming.
    """

    # English stop words, loaded on first use and shared by all instances so
    # the corpus is not reloaded for every single tweet.
    _STOPWORDS = None

    def remove_mentions(self, input_text):
        """Delete every ``@name`` mention."""
        return re.sub(r'@\w+', '', str(input_text))

    def remove_urls(self, input_text):
        """Delete every URL together with one trailing whitespace character."""
        return re.sub(r'http.?://[^\s]+[\s]?', '', str(input_text))

    def emoji_oneword(self, input_text):
        """Remove underscores so an underscore-joined name becomes one word."""
        return input_text.replace('_', '')

    def remove_punctuation(self, input_text):
        """Replace every ``string.punctuation`` character with a space."""
        punct = string.punctuation
        trantab = str.maketrans(punct, len(punct) * ' ')
        return input_text.translate(trantab)

    def remove_digits(self, input_text):
        """Delete every run of digits."""
        # Raw string: '\d' in a normal string literal is an invalid escape sequence.
        return re.sub(r'\d+', '', str(input_text))

    def to_lower(self, input_text):
        """Lower-case the text."""
        return input_text.lower()

    @classmethod
    def _english_stopwords(cls):
        """Return the NLTK English stop words as a frozenset, loading them once."""
        if cls._STOPWORDS is None:
            cls._STOPWORDS = frozenset(stopwords.words('english'))
        return cls._STOPWORDS

    def remove_stopwords(self, input_text):
        """Drop English stop words and one-character tokens.

        Words in the whitelist are kept even though they are stop words,
        because they may carry sentiment. Note that inside ``transform`` the
        punctuation step runs first, so a token containing an apostrophe such
        as "n't" cannot reach this method through that path.
        """
        stopword_set = self._english_stopwords()
        whitelist = ["n't", "not", "no"]
        clean_words = [
            word for word in input_text.split()
            if (word not in stopword_set or word in whitelist) and len(word) > 1
        ]
        return " ".join(clean_words)

    def stemming(self, input_text):
        """Apply the Porter stemmer to every whitespace-separated word."""
        porter = PorterStemmer()
        return " ".join(porter.stem(word) for word in input_text.split())

    def fit(self, X, y=None, **fit_params):
        """Learn nothing from the data and return ``self``."""
        return self

    def transform(self, X, **transform_params):
        """Run every cleaning step over ``X`` and return the cleaned result.

        ``X`` must support ``.apply`` (the notebook passes a pandas Series).
        """
        steps = (
            self.remove_mentions,
            self.remove_urls,
            self.emoji_oneword,
            self.remove_punctuation,
            self.remove_digits,
            self.to_lower,
            self.remove_stopwords,
            self.stemming,
        )
        clean_X = X
        for step in steps:
            clean_X = clean_X.apply(step)
        return clean_X

In [8]:
# Load the labelled training tweets and the unlabelled test tweets
df = load_train()
df1 = load_test()

# Count-based numerical features; the label is attached to the training frame
tc = TextCounts()
df_train = tc.fit_transform(df.tweet)
df_train['sentiment'] = df.sentiment
# Test data only goes through transform: nothing is ever fitted on it
df_test = tc.transform(df1['tweet'])

# Cleaned text for both sets, using the one instance fitted on the training tweets
ct = CleanText()
train_clean = ct.fit_transform(df['tweet'])
test_clean = ct.transform(df1['tweet'])

# df_model / df1_model are the same objects as df_train / df_test, extended
# with the cleaned text column
df_model = df_train
df_model['clean_text'] = train_clean
df1_model = df_test
df1_model['clean_text'] = test_clean

X = df_model['clean_text']
X1 = df1_model['clean_text']
y = df_model.sentiment

# TF-IDF vocabulary is learned on the training text only and reused for the test text
tfidf = TfidfVectorizer(stop_words='english')
Xt = tfidf.fit_transform(X)
# NOTE: from here on df_test is rebound from the feature frame to the cleaned test text
df_test = df1_model['clean_text']
df_test1 = tfidf.transform(df_test)

def LogReg(X,y):
    """Fit a logistic regression on an 80/20 split and return its micro-F1.

    The data is split with ``test_size=0.2`` and ``random_state=6``. The
    fitted classifier is stored in the module-level name ``model`` so that
    code outside this function can use it afterwards.

    Returns the micro-averaged F1 score measured on the held-out 20%.
    """
    global model
    split = train_test_split(X, y, test_size=0.2, random_state=6)
    X_fit, X_val, y_fit, y_val = split

    model = LogisticRegression()
    model.fit(X_fit, y_fit)

    # Score the validation part of the split
    val_pred = model.predict(X_val)
    return f1_score(y_val, val_pred, average='micro')

# Training: fit on the TF-IDF matrix and report the validation micro-F1
X, y = Xt, df_model.sentiment
f1 = LogReg(X, y)
print('f1 score is:', f1)

def prediction(test):
    """Return the predictions of the module-level ``model`` for ``test``.

    ``model`` must already exist as a global (``LogReg`` assigns it); this
    function simply forwards ``test`` to ``model.predict``.
    """
    return model.predict(test)



# Keep the tweet ids of the test set. They come from df1, the frame returned by
# load_test() above; the name `dftest` is never assigned in this notebook and
# raised a NameError on a fresh kernel.
Id = df1[['tweet_id']]

# Predict on the vectorised test tweets and show the class distribution
y_pred = pd.DataFrame(prediction(df_test1), columns=['sentiment'])
print(y_pred['sentiment'].value_counts())

# Put ids and predictions side by side; axis is passed by keyword because
# newer pandas versions no longer accept it positionally.
submission = pd.concat([Id, y_pred['sentiment']], axis=1)
submission.to_csv('submission.csv', index=False)

f1 score is: 0.6701030927835051
1    1420
2    391 
0    8   
Name: sentiment, dtype: int64
