In [65]:
import csv

## Reading the data set

In [85]:
with open('Tweets.csv') as csv_file:
    read_CSV = csv.reader(csv_file, delimiter=',')
    # sentiment col # 1
    Y = list()
    # reviews col # 10
    X_text = list()
    for row in read_CSV:
        Y.append(row[1])
        X_text.append(row[10])
    Y = Y[1:]
    X_text = X_text[1:]

In [86]:
print(Y[0],X_text[0])

neutral @VirginAmerica What @dhepburn said. 1.0 Virgin America


## Filter tweets

In [89]:
from langdetect import detect
X_text_new = list()
Y_new = list()
sentiment_confidence_new = list()
airlines_new = list()

for idx, tweet in enumerate(X_text.copy()):
    if 'RT' in tweet:
        continue
    
    if len(tweet) < 20:
        continue
        
    if detect(tweet) != 'en':
        continue
        
    X_text_new.append(X_text[idx])
    Y_new.append(Y[idx])
    
X_text = X_text_new
Y = Y_new

## Cleaning the text

In [71]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
from string import punctuation
import re

# remove urls and strip
X_text = [re.sub(r'https?:\/\/.*[\r\n]*', '', text).strip() for text in X_text]

# remove punctuation
punctuation += "“’…”"
X_text = [''.join([char for char in text if char not in punctuation]) for text in X_text]

# tokenize tweets
X_tokenized = [word_tokenize(text) for text in X_text]

# stemming
snowball_stemmer = nltk.stem.SnowballStemmer('english')
X_tokenized = [[snowball_stemmer.stem(word) for word in words] for words in X_tokenized]

# remove stopwords
X_tokenized = [[word for word in words if word not in stopwords.words('english')] for words in X_tokenized]

X = [' '.join(words) for words in X_tokenized]



In [72]:
print(X[:20])

['virginamerica dhepburn said', 'virginamerica plus youv ad commerci experi tacki', 'virginamerica didnt today must mean need take anoth trip', 'virginamerica realli aggress blast obnoxi entertain guest face amp littl recours', 'virginamerica realli big bad thing', 'virginamerica serious would pay 30 flight seat didnt play realli onli bad thing fli va', 'virginamerica yes near everi time fli vx ear worm wont go away', 'virginamerica realli miss prime opportun men without hat parodi', 'virginamerica amaz arriv hour earli good', 'virginamerica know suicid second lead caus death among teen 1024', 'virginamerica lt3 pretti graphic much better minim iconographi', 'virginamerica great deal alreadi think 2nd trip australia amp havent even gone 1st trip yet p', 'virginamerica virginmedia im fli fabul seduct sky u take stress away travel', 'virginamerica thank', 'virginamerica sfopdx schedul still mia', 'virginamerica excit first cross countri flight lax mco ive heard noth great thing virgin am

## Split the dataset

In [73]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=1)

## Extract features using tf-idf

In [74]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

# Calculating the tf-idf scores
X_train_transformed = vectorizer.fit_transform(X_train)
X_test_transformed = vectorizer.transform(X_test)

In [75]:
print(X_train_transformed.shape)
print(X_test_transformed.shape)

(11041, 10703)
(2761, 10703)


## Use the classifiers
The three classifiers to be used (import them from sklearn) are:

a) Multinomial Naive Bayes

b) K Nearest Neighbors Classifier

c) Random Forest Classifier

### Multinomial Naive Bayes

In [76]:
from sklearn.naive_bayes import MultinomialNB

# Initialize classifier
naive_bayes_clf = MultinomialNB()

# Train classifier
naive_bayes_clf.fit(X_train_transformed, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [77]:
print(naive_bayes_clf.score(X_test_transformed, y_test))

0.6812749003984063


#### F1 Score

In [78]:
from sklearn.metrics import f1_score

y_pred = naive_bayes_clf.predict(X_test_transformed)

score = f1_score(y_test, y_pred, average='micro')
print(score)

0.6812749003984063


### K Nearest Neighbors Classifier

In [79]:
from sklearn.neighbors import KNeighborsClassifier

# Initialize classifier
k_nearest_clf = KNeighborsClassifier(n_neighbors=3)

# Train classifier
k_nearest_clf.fit(X_train_transformed, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')

#### F1 Score

In [80]:
from sklearn.metrics import f1_score

y_pred = k_nearest_clf.predict(X_test_transformed)

score = f1_score(y_test, y_pred, average='micro')
print(score)

0.5896414342629482


## Random Forest Classifier

In [81]:
from sklearn.ensemble import RandomForestClassifier

# Initialize classifier
random_forest_clf = RandomForestClassifier()

# Train classifier
random_forest_clf.fit(X_train_transformed, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

#### F1 Score

In [82]:
from sklearn.metrics import f1_score

y_pred = random_forest_clf.predict(X_test_transformed)

score = f1_score(y_test, y_pred, average='micro')
print(score)

0.7540746106483158


# Bonus

#### Work on the larger dataset

In [10]:
import csv

with open('train.csv', encoding="ISO-8859-1") as csv_file:
    read_CSV = csv.reader(csv_file, delimiter=',')
    # sentiment col # 1
    Y = list()
    # reviews col # 10
    X_text = list()
    for row in read_CSV:
        Y.append(row[0])
        X_text.append(row[5])
    Y = Y[1:]
    X_text = X_text[1:]

In [11]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
from string import punctuation
import re

# remove urls and strip
X_text = [re.sub(r'https?:\/\/.*[\r\n]*', '', text).strip() for text in X_text]
print("Done removing urls..")

# remove punctuation
punctuation += "“’…”"
X_text = [''.join([char for char in text if char not in punctuation]) for text in X_text]
print("done removing punctuation..")

# tokenize tweets
X_tokenized = [word_tokenize(text) for text in X_text]
print("done tokenizing..")

# stemming
snowball_stemmer = nltk.stem.SnowballStemmer('english')
X_tokenized = [[snowball_stemmer.stem(word) for word in words] for words in X_tokenized]
print("done stemming..")

# remove stopwords
X_tokenized = [[word for word in words if word not in stopwords.words('english')] for words in X_tokenized]

X = [' '.join(words) for words in X_tokenized]
print("done removing stop words..")

Done removing urls..
done removing punctuation..
done tokenizing..
done stemming..
done removing stop words..


In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=1)

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

# Calculating the tf-idf scores
X_train_transformed = vectorizer.fit_transform(X_train)
X_test_transformed = vectorizer.transform(X_test)

In [17]:
from sklearn.ensemble import RandomForestClassifier

# Initialize classifier
random_forest_clf = RandomForestClassifier()

# Train classifier
random_forest_clf.fit(X_train_transformed, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [18]:
from sklearn.metrics import f1_score

y_pred = random_forest_clf.predict(X_test_transformed)

score = f1_score(y_test, y_pred, average='micro')
print(score)

0.749596875
