In [128]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [129]:
# Load the labelled and unlabelled tweets (first CSV column becomes the index),
# plus the sample submission file, which is read with a default integer index.
train_df, test_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("data/train.csv", "data/test.csv")
)
submission = pd.read_csv("data/sample_submission.csv")

# Preprocessing

In [130]:
# Drop the columns that the visible pipeline never reads.
# Rebinding (instead of `inplace=True`) together with errors='ignore' keeps this
# cell safe to re-run: the in-place version raised KeyError on a second execution
# because the columns had already been removed.
UNUSED_COLUMNS = ['keyword', 'location']
train_df = train_df.drop(columns=UNUSED_COLUMNS, errors='ignore')
test_df = test_df.drop(columns=UNUSED_COLUMNS, errors='ignore')

In [131]:
# Peek at the first two rows to confirm the remaining columns.
train_df.iloc[:2]

Unnamed: 0_level_0,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Our Deeds are the Reason of this #earthquake M...,1
4,Forest fire near La Ronge Sask. Canada,1


In [132]:
# Chat/SMS slang -> plain-English expansion, used to normalise tweets.
# Source: https://github.com/rishabhverma17/sms_slang_translator/blob/master/slang.txt
#
# Keys are upper-case; look tokens up with `token.upper()`.
# Entries that are commented out are intentionally disabled (presumably because
# they collide with ordinary words or add little signal -- confirm before re-enabling).
chat_words = {
    "AFAIK": "As Far As I Know",
    "AFK": "Away From Keyboard",
    "ASAP": "As Soon As Possible",
    "ATK": "At The Keyboard",
    "ATM": "At The Moment",
    "A3": "Anytime, Anywhere, Anyplace",
    "BAK": "Back At Keyboard",
    "BBL": "Be Back Later",
    "BBS": "Be Back Soon",
    "BFN": "Bye For Now",
    "B4N": "Bye For Now",
    "BRB": "Be Right Back",
    "BRT": "Be Right There",
    "BTW": "By The Way",
    "B4": "Before",
    "CU": "See You",
    "CUL8R": "See You Later",
    "CYA": "See You",
    "FAQ": "Frequently Asked Questions",
    "FC": "Fingers Crossed",
    "FWIW": "For What It's Worth",
    "FYI": "For Your Information",
    "GAL": "Get A Life",
    "GG": "Good Game",
    "GN": "Good Night",
    "GMTA": "Great Minds Think Alike",
    "GR8": "Great!",
    "G9": "Genius",
    "IC": "I See",
    # The expansion is substituted verbatim into the text, so it must not carry
    # explanatory asides or repeat the acronym itself.
    "ICQ": "I Seek You",
    "ILU": "I Love You",
    "IMHO": "In My Honest/Humble Opinion",
    "IMO": "In My Opinion",
    "IOW": "In Other Words",
    "IRL": "In Real Life",
    #"KISS": "Keep It Simple, Stupid",
    "LDR": "Long Distance Relationship",
    #"LMAO": "Laugh My A.. Off",
    #"LOL": "Laughing Out Loud",
    "LTNS": "Long Time No See",
    "L8R": "Later",
    "MTE": "My Thoughts Exactly",
    "M8": "Mate",
    "NRN": "No Reply Necessary",
    "OIC": "Oh I See",
    "PITA": "Pain In The A..",
    "PRT": "Party",
    "PRW": "Parents Are Watching",
    "QPSA?": "Que Pasa?",
    "ROFL": "Rolling On The Floor Laughing",
    "ROFLOL": "Rolling On The Floor Laughing Out Loud",
    "ROTFLMAO": "Rolling On The Floor Laughing My A.. Off",
    "SK8": "Skate",
    "STATS": "Your sex and age",
    "ASL": "Age, Sex, Location",
    "THX": "Thank You",
    "TTFN": "Ta-Ta For Now!",
    "TTYL": "Talk To You Later",
    "U": "You",
    "U2": "You Too",
    "U4E": "Yours For Ever",
    "WB": "Welcome Back",
    "WTF": "What The F...",
    "WTG": "Way To Go!",
    "WUF": "Where Are You From?",
    "W8": "Wait...",
    "7K": "Sick:-D Laugher",
    "TFW": "That feeling when",
    "MFW": "My face when",
    "MRW": "My reaction when",
    "IFYP": "I feel your pain",
    "TNTL": "Trying not to laugh",
    "JK": "Just kidding",
    "IDC": "I don't care",
    "ILY": "I love you",
    "IMU": "I miss you",
    "ADIH": "Another day in hell",
    "ZZZ": "Sleeping, bored, tired",
    "WYWH": "Wish you were here",
    #"TIME": "Tears in my eyes",
    "BAE": "Before anyone else",
    "FIMH": "Forever in my heart",
    "BSAAW": "Big smile and a wink",
    "BWL": "Bursting with laughter",
    "BFF": "Best friends forever",
    "CSL": "Can't stop laughing",
}

In [133]:
import string

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

import re

def preprocessing(df, stop_words=None, slang=None):
    """Clean the ``text`` column of ``df`` for bag-of-words modelling.

    Steps, in order:

    1. lower-case the text (missing values become empty strings);
    2. strip HTML tags;
    3. strip URLs (``http...`` / ``www. ...``);
    4. strip ASCII punctuation;
    5. drop stop words;
    6. expand chat slang (e.g. ``brb`` -> ``be right back``).

    Tags and URLs are removed *before* punctuation. Once ``<``, ``>``, ``:``,
    ``/`` and ``.`` are gone, the tag pattern can no longer match and URLs have
    already been fused into a single junk token.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``text`` column. The column is overwritten in place.
    stop_words : iterable of str, optional
        Lower-case words to remove. Defaults to NLTK's English stop-word list.
    slang : mapping of str to str, optional
        Upper-case slang token -> expansion. Defaults to the notebook-level
        ``chat_words`` dictionary.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``text`` replaced by the cleaned text.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    if slang is None:
        slang = chat_words

    # A set gives constant-time membership tests; the list returned by NLTK
    # would be scanned linearly for every token of every tweet.
    stop_words = set(stop_words)
    punctuation_table = str.maketrans('', '', string.punctuation)

    def remove_tags(text):
        """Delete anything that looks like an HTML tag."""
        return re.sub('<[^<]+?>', '', text)

    def drop_stop_words(text):
        """Keep only the whitespace-separated tokens that are not stop words."""
        return ' '.join(word for word in text.split() if word not in stop_words)

    def chat_conversion(text):
        """Replace known slang tokens with their lower-cased expansion."""
        return ' '.join(
            slang[token.upper()].lower() if token.upper() in slang else token
            for token in text.split()
        )

    # fillna('') keeps the .apply() helpers from receiving a float NaN.
    text = df['text'].fillna('').str.lower()
    text = text.apply(remove_tags)
    # regex=True must be explicit: pandas >= 2.0 treats the pattern literally
    # by default, which would leave every URL in place.
    text = text.str.replace(r'http\S+|www\.\S+', '', case=False, regex=True)
    text = text.str.translate(punctuation_table)
    text = text.apply(drop_stop_words)
    text = text.apply(chat_conversion)

    df['text'] = text
    return df

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [134]:
# Run the same cleaning pipeline over both splits, then preview the result.
train_df, test_df = map(preprocessing, (train_df, test_df))
train_df.head()

Unnamed: 0_level_0,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,deeds reason earthquake may allah forgive us,1
4,forest fire near la ronge sask canada,1
5,residents asked shelter place notified officer...,1
6,13000 people receive wildfires evacuation orde...,1
7,got sent photo ruby alaska smoke wildfires pou...,1


---

## Lemmatizer

In [135]:
from nltk.stem import WordNetLemmatizer

# WordNetLemmatizer reads the WordNet corpus, which is not bundled with NLTK;
# without this download a fresh environment fails with LookupError.
# Some NLTK releases additionally look for 'omw-1.4' when WordNet is loaded.
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)

wordnet_lemmatizer = WordNetLemmatizer()


def lemmatize_verbs(text):
    """Lemmatize every whitespace-separated token of ``text`` as a verb.

    Tokens are passed to WordNet with ``pos='v'`` and re-joined with single spaces.
    """
    return ' '.join(
        wordnet_lemmatizer.lemmatize(word, pos='v') for word in text.split()
    )


# One shared helper keeps the train and test transformations identical.
train_df['text_lemmatized'] = train_df['text'].apply(lemmatize_verbs)
test_df['text_lemmatized'] = test_df['text'].apply(lemmatize_verbs)

In [136]:
# Compare the cleaned text with its lemmatized counterpart on the first five rows.
train_df.iloc[:5]

Unnamed: 0_level_0,text,target,text_lemmatized
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,deeds reason earthquake may allah forgive us,1,deeds reason earthquake may allah forgive us
4,forest fire near la ronge sask canada,1,forest fire near la ronge sask canada
5,residents asked shelter place notified officer...,1,residents ask shelter place notify officer eva...
6,13000 people receive wildfires evacuation orde...,1,13000 people receive wildfires evacuation orde...
7,got sent photo ruby alaska smoke wildfires pou...,1,get send photo ruby alaska smoke wildfires pou...


---

## Model

In [137]:
# sklearn's `preprocessing` module is imported under an alias. Importing it under
# its own name rebinds `preprocessing`, which replaces the text-cleaning function
# defined earlier in this notebook and makes any later re-run of the cell that
# calls that function fail with "'module' object is not callable".
from sklearn import feature_extraction, linear_model, model_selection
from sklearn import preprocessing as sk_preprocessing

count_vectorizer = feature_extraction.text.CountVectorizer()

# Learn the vocabulary from the training text and build its token-count matrix.
train_vectors = count_vectorizer.fit_transform(train_df["text_lemmatized"])

# Note that we are NOT using .fit_transform() here. Using just .transform() makes
# sure that the tokens in the train vectors are the only ones mapped to the test
# vectors, i.e. that the train and test vectors use the same set of tokens.
test_vectors = count_vectorizer.transform(test_df["text_lemmatized"])

In [138]:
# The sparse matrix already exposes its shape; calling .todense() first would
# allocate the full documents-by-vocabulary dense array only to read two integers.
train_vectors.shape

(7613, 20452)

In [139]:
# The feature vectors are very wide, so we want a model that shrinks its weights
# toward 0 without discarding individual words outright; ridge regression does
# exactly that. alpha=1.0 is scikit-learn's default, spelled out for the reader.
clf = linear_model.RidgeClassifier(alpha=1.0)

In [140]:
# 3-fold cross-validated F1 of the ridge classifier on the training vectors.
scores = model_selection.cross_val_score(
    estimator=clf,
    X=train_vectors,
    y=train_df["target"],
    cv=3,
    scoring="f1",
)
scores

array([0.58333333, 0.53732859, 0.59714707])

---