In [1]:
## Importing the libraries

import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from scipy.sparse import hstack
from sklearn.metrics import f1_score

train = pd.read_csv('trainl.csv')
test = pd.read_csv('testl.csv')
print(train.shape); print(test.shape)

(7920, 3)
(1953, 2)


In [2]:
## This is typical sentiment Analysis problem.
# Customer Tweets related to tech firms who are manufacturers of mobiles, laptops are given to us.
# The task is to determine tweets which have negative sentiments towards such companies or products.
train.label.value_counts() #Most of the tweets have positive sentiments.

0    5894
1    2026
Name: label, dtype: int64

In [3]:
# train.isna().sum()
## Clearly there are no missing values.
## Data Preprocessing
## Not using deep learning models using simple ml algorithm - Logistic Regression.
# And so we will simply use frequency based embeddings loke tfidf or count vectorizer.
def clean_text(text):
    # firstly put all the texts in lower cases
    text = text.lower()
    text = text.replace('$&@*#', 'bakwas')
    text = text.replace('f**k', 'fuck')
    text = text.replace('@$$hole', 'asshole')
    text = text.replace('f#%*king', 'fucking')
    text = text.replace(':@', 'bakwas')
    return text
train['tweet']=train['tweet'].apply(lambda x: clean_text(x))
test['tweet']=test['tweet'].apply(lambda x: clean_text(x))

In [4]:
## Since twitter ID can be '@' followed by some alphanumeric we need to remove them.
# Because they are just ID's and will play any role in determining the sentiments.
def remove_user(text):
    r = re.findall('@[\w]*', text)
    for i in r:
        text = re.sub(i, '', text)
    return text    
train.tweet = train.tweet.apply(lambda x: remove_user(x))
test.tweet = test.tweet.apply(lambda x: remove_user(x))

In [5]:
## Similarly there are many URL's which we need to remove as they wont play any role in sentiments.
def remove_url(text):
    text = re.sub('(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '', text)
    return text   
train.tweet = train.tweet.apply(lambda x: remove_url(x))
test.tweet = test.tweet.apply(lambda x: remove_url(x))

In [16]:
## Now we will split our training data into train and validation so that we can do proper regularisation.
X_train, X_valid, y_train, y_valid = train_test_split(train['tweet'], train['label'], test_size = 0.1,
                                                      random_state=12)

In [20]:
## Part1 -- using count vectoriser and Naive Bayes Algorithm.
vect = CountVectorizer().fit(X_train)
X_train_vectorized = vect.transform(X_train)

model = MultinomialNB(alpha = 0.0925)
model.fit(X_train_vectorized, y_train)
predictions = model.predict(vect.transform(X_valid))
## Clearly our submissions are evaluated on the basis of F1Score
print(f1_score(y_valid, predictions))

0.819672131147541


In [8]:
## Part2 -- using tfidf vectorizer and Naive Bayes Algorithm.
tfvect = TfidfVectorizer().fit(X_train)
X_train_vectorized = tfvect.transform(X_train)

model = MultinomialNB(alpha = 0.0955)
model.fit(X_train_vectorized, y_train)
predictions = model.predict(tfvect.transform(X_valid))
print(f1_score(y_valid, predictions))

0.8253275109170305


In [9]:
## Part3 -- using count vectoriser and Logistic Regression Algorithm.
vect = CountVectorizer(min_df=2, ngram_range=(1,3)).fit(X_train)
X_train_vectorized = vect.transform(X_train)

model = LogisticRegression(C = 1.6, solver = 'sag')
model.fit(X_train_vectorized, y_train)
predictions = model.predict(vect.transform(X_valid))
print(f1_score(y_valid, predictions))

0.8301886792452831




In [10]:
## Part4 -- using tfidf vectorizer and Logistic Regression Algorithm.
## Word Level tf idf vectorizer.

text = pd.concat([train.tweet, test.tweet])
Tfword_vectorizer = TfidfVectorizer(sublinear_tf=True,strip_accents='unicode',analyzer='word',ngram_range=(1, 3),max_features=10000).fit(text)
word_train_vectorized = Tfword_vectorizer.transform(X_train)
word_valid_vectorized = Tfword_vectorizer.transform(X_valid)
word_test_vectorized = Tfword_vectorizer.transform(test.tweet)

In [11]:
## Character level tf idf vectoriser.
Tfchar_vectorizer = TfidfVectorizer(sublinear_tf=True,strip_accents='unicode',analyzer='char',ngram_range=(1, 15),max_features=50000).fit(text)
char_train_vectorized = Tfchar_vectorizer.transform(X_train)
char_valid_vectorized = Tfchar_vectorizer.transform(X_valid)
char_test_vectorized = Tfchar_vectorizer.transform(test.tweet)

In [12]:
## Horizontally stacking the tf idf vectorizers.
train_features = hstack([char_train_vectorized, word_train_vectorized])
valid_features = hstack([char_valid_vectorized, word_valid_vectorized])
test_features = hstack([char_test_vectorized, word_test_vectorized])

In [13]:
model = LogisticRegression(max_iter=300,C=2.0,solver='sag')
model.fit(train_features, y_train)
predictions = model.predict(valid_features)
pred_y = model.predict(test_features)
print(f1_score(y_valid, predictions))

0.8364485981308412
