In [74]:
import pandas as pd
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')

import numpy as np
import re
import gensim
import nltk
from sklearn.model_selection import train_test_split
import matplotlib
import codecs
import codecs
import itertools
import matplotlib.pyplot as plt
from nltk.tokenize import RegexpTokenizer
from sklearn.metrics import confusion_matrix
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
import nameparser


In [45]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sample_sub = pd.read_csv('sample_submission.csv')


In [46]:

def text_standardizer(text,area):
    yer = ['!','@','#','$','%','_',',','\n','?','<','>']
    for key in yer:
        text[area]=text[area].apply(lambda x:str(x).replace(key,''))
    return text


test_data = text_standardizer(test, 'Tweet')
# colTokens = pd.Series([[y for y in x if y != ''] for x in colTokens], index=colTokens.index)
# colTokens = test['Tweet'].str.split(' ')





In [47]:
tokenizer = RegexpTokenizer(r'\w+')
test_data['Tokens'] = test_data['Tweet'].apply(tokenizer.tokenize)

In [48]:
all_words = [word for tokens in test_data["Tokens"] for word in tokens]

sentence_lengths = [len(tokens) for tokens in  test_data["Tokens"]]

VOCAB = sorted(list(set(all_words)))

print("%s words total, with a vocabulary size of %s" % (len(all_words), len(VOCAB)))

15055 words total, with a vocabulary size of 4555


In [49]:
# count_all_words=Counter(all_words)
# count_all_words.most_common((100))


In [50]:
list_corpus = train['text']
list_labels = train['class_label']
X_train, X_test, y_train, y_test = train_test_split(list_corpus, list_labels, test_size=0.2, random_state=40)

In [51]:
word2vec_path = "GoogleNews-vectors-negative300.bin"
word2vec = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)


In [52]:
def get_average_word2vec(tokens_list, vector, generate_missing=False, k=300):
    if len(tokens_list) < 1:
        return np.zeros(k)
    if generate_missing:
        vectorized = [vector[word] if word in vector else np.random.rand(k) for word in tokens_list]
    else:
        vectorized = [vector[word] if word in vector else np.zeros(k) for word in tokens_list]
    length = len(vectorized)
    summed = np.sum(vectorized, axis=0)
    averaged = np.divide(summed, length)
    return averaged

def get_word2vec_embeddings(vectors, clean_questions_tokens, generate_missing=False):
    embeddings = clean_questions_tokens.apply(lambda x: get_average_word2vec(x, vectors, generate_missing=generate_missing))
                                                                            
    return list(embeddings)

embeddings = get_word2vec_embeddings(word2vec, train['tokens'])


In [53]:
X_train_w2v, X_test_w2v, y_train_w2v, y_test_w2v = train_test_split(embeddings, list_labels,  test_size=0.2, random_state=40)

In [54]:
w2v = dict()
w2v["train"] = (X_train_w2v, y_train_w2v)
w2v["test"] = (X_test_w2v, y_test_w2v)


In [55]:
embedding = w2v

In [56]:
from sklearn.linear_model import LogisticRegression

lr_classifier = LogisticRegression(C=30.0, class_weight='balanced', solver='newton-cg', multi_class='multinomial', random_state=40)

In [57]:
classifier= lr_classifier

In [58]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report

def get_metrics(y_test, y_predicted):  
    # true positives / (true positives+false positives)
    precision = precision_score(y_test, y_predicted, pos_label=None,
                                    average='weighted')             
    # true positives / (true positives + false negatives)
    recall = recall_score(y_test, y_predicted, pos_label=None,
                              average='weighted')
    
    # harmonic mean of precision and recall
    f1 = f1_score(y_test, y_predicted, pos_label=None, average='weighted')
    
    # true positives + true negatives/ total
    accuracy = accuracy_score(y_test, y_predicted)
    return accuracy, precision, recall, f1

In [59]:
classifier.fit(*embedding["train"])
y_predict = classifier.predict(embedding["test"][0])

In [60]:
accuracy, precision, recall, f1 = get_metrics(embedding["test"][1], y_predict)
print("accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy, precision, recall, f1))

accuracy = 0.564, precision = 0.642, recall = 0.564, f1 = 0.600


In [61]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.winter):
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, fontsize=30)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, fontsize=20)
    plt.yticks(tick_marks, classes, fontsize=20)
    
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.

    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt), horizontalalignment="center", 
                 color="white" if cm[i, j] < thresh else "black", fontsize=40)
    
    plt.tight_layout()
    plt.ylabel('True label', fontsize=30)
    plt.xlabel('Predicted label', fontsize=30)

    return plt

In [62]:
cm = confusion_matrix(embedding["test"][1], y_predict)
fig = plt.figure(figsize=(10, 10))
plot = plot_confusion_matrix(cm, classes=['Irrelevant','Disaster', 'Unsure'], normalize=False, title='Confusion Matrix')
plt.show()

In [63]:
test_corpus = test['Tweet']
test_Id = test['Id']

In [64]:
test_corpus_tokens = test_corpus.apply(tokenizer.tokenize)

In [65]:

count_vectorizer = CountVectorizer(analyzer='word', token_pattern=r'\w+')
vectorized_text = dict()

vectorized_text['test']  =get_word2vec_embeddings(word2vec, test_corpus_tokens)

In [66]:
embedding = vectorized_text                
classifier = lr_classifier     
predicted_sentiment = classifier.predict(embedding['test']).tolist()

In [68]:
results = pd.DataFrame(
    {'Id': test_Id,
     'Expected': predicted_sentiment
    })

In [72]:
results

Unnamed: 0,Id,Expected
0,0,0
1,1,0
2,2,0
3,3,0
4,4,1
5,5,1
6,6,1
7,7,0
8,8,0
9,9,2
