In [None]:
import os
import pandas as pd
import numpy as np
import re
import time
import emoji
import string
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chuqinwu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/chuqinwu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/chuqinwu/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [None]:
# Print the path of every file found under data/ (recursively).
for root, _subdirs, files in os.walk('data/'):
    for name in files:
        print(os.path.join(root, name))

data/test.csv
data/train.csv
data/sample_submission.csv


In [None]:
# Read the training and test CSV files into DataFrames.
train_df, test_df = map(pd.read_csv, ('data/train.csv', 'data/test.csv'))

In [None]:
# Display the training frame (columns shown below: id, keyword, location, text, target).
train_df

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [None]:
# Drop rows whose tweet text repeats an earlier row and renumber the index
# 0..n-1, then show how many rows remain.
train_df = train_df.drop_duplicates(subset='text', ignore_index=True)
train_df.shape[0]

7503

In [None]:

def removeEmoji(text):
    """Return ``text`` with every emoji replaced by the empty string."""
    stripped = emoji.replace_emoji(text, replace='')
    return stripped

#https://stackoverflow.com/questions/15586721/wordnet-lemmatization-and-pos-tagging-in-python
def get_wordnet_pos(treebank_tag):
    """
    Map a POS tag (as returned by nltk.pos_tag) to a WordNet POS constant.

    @param str treebank_tag: the POS tag; only its first letter is inspected.
    @return: wordnet.ADJ / VERB / NOUN / ADV for tags starting with
        J / V / N / R respectively, and wordnet.NOUN for anything else.
    """
    first_letter_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    # Unknown (or empty) tags fall back to noun.
    return first_letter_to_pos.get(treebank_tag[:1], wordnet.NOUN)

def tweet_cleaner(text):
    """
    Normalise a raw tweet into a list of lemmatised, stopword-free tokens.

    Steps, in order: lowercase; blank out digit runs that follow a space;
    strip ASCII punctuation; strip emoji; split on whitespace; POS-tag;
    drop English stopwords and tokens starting with "http"; lemmatise each
    remaining token using its POS tag.

    @param str text: the raw tweet.
    @return list: the cleaned tokens, in their original order.
    """
    text = text.lower()
    # Raw string: "\d" inside a normal literal is an invalid escape sequence.
    text = re.sub(r" \d+", " ", text)
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = removeEmoji(text)

    tokens = WhitespaceTokenizer().tokenize(text)
    tagged_tokens = nltk.pos_tag(tokens)  # [(word, pos_tag), ...]

    # Built once per tweet instead of once per token.
    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()

    result = []
    for word, pos in tagged_tokens:
        # Punctuation was stripped above, so "#", "@" and "\" can no longer
        # occur in a token; the former checks for them were unreachable and
        # have been removed. URLs survive as "httptco..." and are dropped here.
        # NOTE(review): if hashtags/mentions were meant to be discarded, that
        # filter has to run before punctuation is stripped - confirm intent.
        if word in stop_words or word.startswith("http"):
            continue
        result.append(lemmatizer.lemmatize(word, get_wordnet_pos(pos)))
    return result

In [None]:
from collections import Counter
def bag_of_words(ls):
    """
    Build a sparse bag-of-words vector.

    @param ls: the tokens to count (anything Counter accepts).
    @return Counter: mapping from each distinct token to its count.
    """
    counts = Counter()
    counts.update(ls)
    return counts

# Taken from http://web.stanford.edu/class/cs221/ Assignment #2 Support Code
def dotProduct(d1, d2):
    """
    Sparse dot product of two feature vectors.
    @param dict d1: a feature vector represented by a mapping from a feature (string) to a weight (float).
    @param dict d2: same as d1
    @return float: the dot product between d1 and d2
    """
    # Iterate over the shorter vector and look each feature up in the longer
    # one; features missing from the longer vector contribute 0.
    if len(d1) < len(d2):
        shorter, longer = d1, d2
    else:
        shorter, longer = d2, d1
    return sum(longer.get(feature, 0) * value for feature, value in shorter.items())

def increment(d1, scale, d2):
    """
    In-place sparse update: d1 <- d1 + scale * d2.
    @param dict d1: the feature vector which is mutated.
    @param float scale
    @param dict d2: a feature vector.

    NOTE: nothing is returned; d1 is modified directly, which avoids
    building a new dictionary on every update. Features of d2 that are
    absent from d1 are treated as 0 and added to d1.
    """
    for feature in d2:
        d1[feature] = d1.get(feature, 0) + d2[feature] * scale

def pegasos_sw(lbd, X, y, epoch):
    """
    Pegasos-style SGD for a linear SVM on sparse vectors, using a scalar
    scale factor so that the per-step shrinkage costs O(1).

    The true weight vector is s * w: shrinking by (1 - eta * lbd) only
    touches s, and a gradient step adds (eta * y / s) * x to w. The scale
    is folded back into w before returning.

    @param float lbd: regularisation strength; must be > 0.
    @param list X: feature vectors (feature -> value mappings), one per example.
    @param list y: labels aligned with X; expected to be -1 / +1.
    @param int epoch: number of passes over the data, visited in the given order.
    @return dict: the learned weights (feature -> weight).
    @raise ValueError: if lbd is not positive.
    """
    if lbd <= 0:
        # lbd == 0 would divide by zero below; a negative value is meaningless.
        raise ValueError("lbd must be positive, got {!r}".format(lbd))

    w = {}
    s = 1
    t = 0
    for _ in range(epoch):
        for j in range(len(y)):
            t += 1
            eta = 1 / (t * lbd)  # step size for step t
            d = X[j]

            # Register every feature seen so the returned dict has an entry
            # (possibly 0) for each of them.
            for k in d.keys():
                if k not in w:
                    w[k] = 0

            # Margin under the current weights s * w, computed before shrinking.
            margin = s * y[j] * dotProduct(w, d)
            s = s * (1 - eta * lbd)

            # At t == 1, eta * lbd is 1, so s collapses to 0. w holds only
            # zeros at that point, so resetting s to 1 loses nothing and
            # keeps 1/s finite.
            if s == 0:
                s = 1
            if margin < 1:
                increment(w, (1 / s) * eta * y[j], d)

    # Fold the scale factor back in so callers get the true weights.
    for f, v in w.items():
        w[f] = v * s

    return w

def classification_error(w, X, y):
    """
    Fraction of examples misclassified by the linear classifier w (0-1 loss).

    A score of exactly 0 is treated as the positive class, the same
    ``>= 0`` rule used when labelling the test tweets. Previously a zero
    score was never counted as an error, so the reported error could be
    lower than the error of the predictions actually produced.

    @param dict w: weights (feature -> weight).
    @param list X: examples; each is a token list or an already-built count
        mapping (bag_of_words accepts both).
    @param list y: true labels aligned with X; expected to be -1 / +1.
    @return float: number of misclassified examples divided by len(y).
    """
    loss = 0
    for j in range(len(y)):
        d = bag_of_words(X[j])
        predicted = 1 if dotProduct(w, d) >= 0 else -1
        if predicted != y[j]:
            loss += 1
    return loss / len(y)



In [None]:
def train_test_split(data, target):
    """
    Convert token lists and 0/1 labels into training inputs for pegasos_sw.

    Despite its name, this function does not split anything: it returns one
    bag-of-words vector and one -1/+1 label per input row.

    @param data: sequence of token lists, one per example.
    @param target: sequence of labels aligned with data; 0 maps to -1 and
        every other value maps to +1.
    @return tuple: (X_train, y_train) - a list of Counters and a list of ints.
    @raise ValueError: if data and target differ in length.
    """
    if len(data) != len(target):
        raise ValueError(
            "data and target must have the same length, got {} and {}".format(
                len(data), len(target)
            )
        )

    X_train = [bag_of_words(tokens) for tokens in data]
    # Iterate positionally: target[i] on a pandas Series is a label lookup
    # and breaks (or misaligns) when the index is not 0..n-1.
    y_train = [-1 if label == 0 else 1 for label in target]
    return X_train, y_train

In [None]:
# Clean every training tweet into its list of tokens (one list per row).
result = [tweet_cleaner(tweet) for tweet in train_df['text']]


In [None]:
# Build the bag-of-words vectors and -1/+1 labels for the training tweets.
X_train, y_train = train_test_split(data=result, target=train_df['target'])

In [None]:
# Fit the classifier, then report its error on the same data it was trained
# on (a training error, not a held-out estimate).
w = pegasos_sw(lbd=0.01, X=X_train, y=y_train, epoch=10)
classification_error(w=w, X=X_train, y=y_train)

0.1643342662934826

In [None]:
# Clean every test tweet with the same pipeline used for the training tweets.
test_result = [tweet_cleaner(tweet) for tweet in test_df['text']]

In [None]:
# One bag-of-words vector per cleaned test tweet.
X_test = [bag_of_words(tokens) for tokens in test_result]

In [None]:
# Label each test tweet: score >= 0 -> 1, otherwise 0.
# The score is no longer stored in a variable called `result`; that name
# holds the cleaned training tweets, and overwriting it with a float broke
# any later re-run of the cell that builds X_train.
prediction = [1 if dotProduct(w, features) >= 0 else 0 for features in X_test]

In [None]:
# Wrap the labels in a single-column frame named "target". Passing `columns`
# to the constructor (instead of assigning .columns afterwards) also keeps
# this cell re-runnable once `prediction` is already a DataFrame that has
# gained an "id" column, where the assignment raised a length mismatch.
prediction = pd.DataFrame(prediction, columns=['target'])

In [None]:
# Attach ids by position. Assigning the Series itself aligns on index labels,
# which silently yields NaN / misaligned ids if the two indexes ever differ;
# a plain array raises instead when the lengths do not match.
prediction['id'] = test_df['id'].to_numpy()

In [None]:
# Make "id" the index. Guarded so the cell can be re-run: after the first
# run "id" is no longer a column and set_index('id') would raise KeyError.
if 'id' in prediction.columns:
    prediction = prediction.set_index('id')
prediction

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
0,1
2,1
3,1
9,1
11,1
...,...
10861,1
10865,0
10868,1
10874,1


In [None]:
# Write the predictions to disk; the "id" index is written as the first column.
prediction.to_csv('prediction.csv', index=True)