# Extract

In [1]:
import numpy as np # linear algebra
import pandas as pd

# Read the labelled training tweets and the unlabelled test tweets, in that order.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("./data/train.csv", "./data/test.csv")
)

In [2]:
# Preview the first five rows of the raw training data.
train.head(n=5)

Unnamed: 0,TweetId,Label,TweetText
0,304271250237304833,Politics,'#SecKerry: The value of the @StateDept and @U...
1,304834304222064640,Politics,'@rraina1481 I fear so'
2,303568995880144898,Sports,'Watch video highlights of the #wwc13 final be...
3,304366580664528896,Sports,'RT @chelscanlan: At Nitro Circus at #AlbertPa...
4,296770931098009601,Sports,'@cricketfox Always a good thing. Thanks for t...


# Transform and Feature Engineering

## Cleaning Data

In [3]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re
import string

#### A single function cleans each tweet's text and returns the words it contains

In [4]:
def clean(text):
    
    punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~'
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    # remove urls
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    
    # remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation)).lower()
    
    words = [w for w in text.split(" ") if len(w)>1]
    # remove usernames
    words = [w for w in words if w[0] is not "@"]
    
    # remove tags
    words = [w for w in words if w[0] is not "#"]
    
    # remove stopwords
    words = [w for w in words if w not in stop_words]
    
    # remove duplicates
    words = set(words)
    
    # lemmatization
    words = [lemmatizer.lemmatize(w) for w in words if len(w)>1]
    
    return words

In [5]:
def preprocess(x):
    """Clean every tweet in a Series and re-join its words into one string.

    Parameters
    ----------
    x : pandas.Series
        Series of raw tweet texts.

    Returns
    -------
    pandas.Series
        New Series with the same index and name as ``x``; each element is
        the words returned by ``clean`` joined with single spaces. ``x`` is
        not modified.
    """
    # Element-wise apply keeps the original index, so this works for any
    # index (filtered, shuffled, non-integer), unlike positional x[i] lookups.
    return x.apply(lambda text: " ".join(clean(text)))

In [6]:
# Overwrite the raw training text with its cleaned, space-joined form, then preview it.
train["TweetText"] = train["TweetText"].pipe(preprocess)
train.head(n=5)

Unnamed: 0,TweetId,Label,TweetText
0,304271250237304833,Politics,dollar value deepest value seckerry american s...
1,304834304222064640,Politics,rraina1481 fear
2,303568995880144898,Sports,west video final wwc13 watch highlight indie h...
3,304366580664528896,Sports,nitro circus albertpark theymakeitlooksoeasy c...
4,296770931098009601,Sports,good thanks feedback thing always cricketfox


In [7]:
# Overwrite the raw test text with its cleaned, space-joined form, then preview it.
test["TweetText"] = test["TweetText"].pipe(preprocess)
test.head(n=5)

Unnamed: 0,TweetId,TweetText
0,306486520121012224,home stab bennett wide get threaten 28 throw y...
1,286353402605228032,series httptcoygjepjkf mass pound avoirdupois ...
2,289531046037438464,sochi2014 construction shore along httptco8dvi...
3,306451661403062273,httptco4qx0fhypmp foreign video transcript htt...
4,297941800658812928,player first usd ricky go 400000 iplauction in...


# Load

In [8]:
# Persist both processed frames as CSV files, omitting the row index.
train.to_csv(path_or_buf="./data/trainP.csv", index=False)
test.to_csv(path_or_buf="./data/testP.csv", index=False)