In [65]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [66]:
train_df = pd.read_csv("data/train.csv", index_col=0)
test_df = pd.read_csv("data/test.csv", index_col=0)
submission = pd.read_csv("data/sample_submission.csv")

# Preprocessing

In [67]:
train_df.drop(columns=['keyword', 'location'], inplace=True)
test_df.drop(columns=['keyword', 'location'], inplace=True)

In [68]:
train_df.head(2)

Unnamed: 0_level_0,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Our Deeds are the Reason of this #earthquake M...,1
4,Forest fire near La Ronge Sask. Canada,1


In [69]:
# Here Come ChatWords Which i Get from a Github Repository
# Repository Link : https://github.com/rishabhverma17/sms_slang_translator/blob/master/slang.txt
chat_words = {
    "AFAIK": "As Far As I Know",
    "AFK": "Away From Keyboard",
    "ASAP": "As Soon As Possible",
    "ATK": "At The Keyboard",
    "ATM": "At The Moment",
    "A3": "Anytime, Anywhere, Anyplace",
    "BAK": "Back At Keyboard",
    "BBL": "Be Back Later",
    "BBS": "Be Back Soon",
    "BFN": "Bye For Now",
    "B4N": "Bye For Now",
    "BRB": "Be Right Back",
    "BRT": "Be Right There",
    "BTW": "By The Way",
    "B4": "Before",
    "B4N": "Bye For Now",
    "CU": "See You",
    "CUL8R": "See You Later",
    "CYA": "See You",
    "FAQ": "Frequently Asked Questions",
    "FC": "Fingers Crossed",
    "FWIW": "For What It's Worth",
    "FYI": "For Your Information",
    "GAL": "Get A Life",
    "GG": "Good Game",
    "GN": "Good Night",
    "GMTA": "Great Minds Think Alike",
    "GR8": "Great!",
    "G9": "Genius",
    "IC": "I See",
    "ICQ": "I Seek you (also a chat program)",
    "ILU": "ILU: I Love You",
    "IMHO": "In My Honest/Humble Opinion",
    "IMO": "In My Opinion",
    "IOW": "In Other Words",
    "IRL": "In Real Life",
    #"KISS": "Keep It Simple, Stupid",
    "LDR": "Long Distance Relationship",
    #"LMAO": "Laugh My A.. Off",
    #"LOL": "Laughing Out Loud",
    "LTNS": "Long Time No See",
    "L8R": "Later",
    "MTE": "My Thoughts Exactly",
    "M8": "Mate",
    "NRN": "No Reply Necessary",
    "OIC": "Oh I See",
    "PITA": "Pain In The A..",
    "PRT": "Party",
    "PRW": "Parents Are Watching",
    "QPSA?": "Que Pasa?",
    "ROFL": "Rolling On The Floor Laughing",
    "ROFLOL": "Rolling On The Floor Laughing Out Loud",
    "ROTFLMAO": "Rolling On The Floor Laughing My A.. Off",
    "SK8": "Skate",
    "STATS": "Your sex and age",
    "ASL": "Age, Sex, Location",
    "THX": "Thank You",
    "TTFN": "Ta-Ta For Now!",
    "TTYL": "Talk To You Later",
    "U": "You",
    "U2": "You Too",
    "U4E": "Yours For Ever",
    "WB": "Welcome Back",
    "WTF": "What The F...",
    "WTG": "Way To Go!",
    "WUF": "Where Are You From?",
    "W8": "Wait...",
    "7K": "Sick:-D Laugher",
    "TFW": "That feeling when",
    "MFW": "My face when",
    "MRW": "My reaction when",
    "IFYP": "I feel your pain",
    "TNTL": "Trying not to laugh",
    "JK": "Just kidding",
    "IDC": "I don't care",
    "ILY": "I love you",
    "IMU": "I miss you",
    "ADIH": "Another day in hell",
    "ZZZ": "Sleeping, bored, tired",
    "WYWH": "Wish you were here",
    #"TIME": "Tears in my eyes",
    "BAE": "Before anyone else",
    "FIMH": "Forever in my heart",
    "BSAAW": "Big smile and a wink",
    "BWL": "Bursting with laughter",
    "BFF": "Best friends forever",
    "CSL": "Can't stop laughing"
}

In [70]:
from textblob import TextBlob

import string

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

import re

import emoji

def preprocessing(df):
    # duplicates
    # df.drop_duplicates(subset="text", keep='first', inplace=True) # there are also duplicates in training set, drop_duplicates make the loss higher
    
    # lower
    df['text'] = df['text'].str.lower()
    
    # Remove URLs
    df['text'] = df['text'].str.replace(r'http\S+|www\.\S+|https\S+', ' ', case=False, regex=True)
    
    # erasing punctuations
    df['text'] = df['text'].str.translate(str.maketrans('', '', string.punctuation))

    # Spelling issue
    def Spelling_Correction(text):
        text_blob = TextBlob(text)
        #print('done')
        return text_blob.correct().string

    #df['text_blob'] = df['text'].apply(Spelling_Correction)

    # erasing stop_words
    stop_words = stopwords.words('english')
    #df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

    # remove HTML tags
    def remove_tags(text):
        return re.sub('<[^<]+?>', ' ', text)

    df['text'] = df['text'].apply(remove_tags)

    # Handling ChatsWords
    def chat_conversion(text):
        new_text = []
        for i in text.split():
            if i.upper() in chat_words:
                new_text.append(chat_words[i.upper()].lower())
                #print(f"ChatWord found ! That was '{i}'")

            else:
                new_text.append(i)
        return " ".join(new_text)

    df['text'] = df['text'].apply(chat_conversion)

    # Handling Emojis

    df['text'] = df['text'].apply(lambda x: emoji.demojize(x))
    df['text'] = df['text'].apply(lambda x: x.replace(":copyright:",""))

    return df

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [71]:
train_df = preprocessing(train_df)
test_df = preprocessing(test_df)
train_df.head()

Unnamed: 0_level_0,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,our deeds are the reason of this earthquake ma...,1
4,forest fire near la ronge sask canada,1
5,all residents asked to shelter in place are be...,1
6,13000 people receive wildfires evacuation orde...,1
7,just got sent this photo from ruby alaska as s...,1


In [72]:
#train_df.loc[train_df['text'] != train_df['text_blob'], ["text", "text_blob", "target"]]

In [73]:
#print(train_df.loc[train_df['text'] != train_df['text_emoji'], ["text", "text_emoji"]]["text"][3123])
#print(train_df.loc[train_df['text'] != train_df['text_emoji'], ["text", "text_emoji"]]["text_emoji"][3123])

love food fun malaysian prime minister najib razak confirmed that the aircraft debris found on rì©union isla

love food fun malaysian prime minister najib razak confirmed that the aircraft debris found on rì:copyright:union isla

---

In [74]:
# Assurez-vous de télécharger les ressources nécessaires
nltk.download('punkt')

# Fonction pour tokenizer le texte
def tokenize_text(text):
    return nltk.word_tokenize(text)

# Appliquer le tokenizer à chaque observation
train_df['text_tokenized'] = train_df['text'].apply(tokenize_text)
train_df['text_tokenized'] = train_df['text_tokenized'].apply(lambda x: ' '.join([word for word in x]))

test_df['text_tokenized'] = test_df['text'].apply(tokenize_text)
test_df['text_tokenized'] = test_df['text_tokenized'].apply(lambda x: ' '.join([word for word in x]))

train_df.head()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0_level_0,text,target,text_tokenized
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,our deeds are the reason of this earthquake ma...,1,our deeds are the reason of this earthquake ma...
4,forest fire near la ronge sask canada,1,forest fire near la ronge sask canada
5,all residents asked to shelter in place are be...,1,all residents asked to shelter in place are be...
6,13000 people receive wildfires evacuation orde...,1,13000 people receive wildfires evacuation orde...
7,just got sent this photo from ruby alaska as s...,1,just got sent this photo from ruby alaska as s...


---

## Lemmatizer

In [75]:
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

train_df['text_lemmatized'] = train_df['text_tokenized'].apply(lambda x: ' '.join([wordnet_lemmatizer.lemmatize(word , pos='v') for word in x.split()]))
test_df['text_lemmatized'] = test_df['text_tokenized'].apply(lambda x: ' '.join([wordnet_lemmatizer.lemmatize(word , pos='v') for word in x.split()]))

In [76]:
train_df.head()

Unnamed: 0_level_0,text,target,text_tokenized,text_lemmatized
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,our deeds are the reason of this earthquake ma...,1,our deeds are the reason of this earthquake ma...,our deeds be the reason of this earthquake may...
4,forest fire near la ronge sask canada,1,forest fire near la ronge sask canada,forest fire near la ronge sask canada
5,all residents asked to shelter in place are be...,1,all residents asked to shelter in place are be...,all residents ask to shelter in place be be no...
6,13000 people receive wildfires evacuation orde...,1,13000 people receive wildfires evacuation orde...,13000 people receive wildfires evacuation orde...
7,just got sent this photo from ruby alaska as s...,1,just got sent this photo from ruby alaska as s...,just get send this photo from ruby alaska as s...


---

## Corpus and vocabulary

In [77]:
index_lemmatized = list(train_df.columns).index('text_lemmatized')

In [78]:
corpus = []
for i in range(len(train_df)):
    review = re.sub('[^a-zA-Z]', ' ', train_df.iloc[i, index_lemmatized])
    review = review.split()
    review = ' '.join(review)
    corpus.append(review)

print(f'The Lenght of the Corpus is : {len(corpus)}') # in fact, count of review

The Lenght of the Corpus is : 7613


In [79]:
# Total number of words in corpus
# Initialize total_words counter
total_words = 0

# Iterate through each element in the corpus list
for text in corpus:
    # Split the text into words and update the total_words counter
    total_words += len(text.split())

# Print the total number of words
print(f"Total words in Corpus is : {total_words}")

Total words in Corpus is : 105383


In [80]:
vocabulary = set()

for text in corpus:
    words = text.split()
    vocabulary.update(words)

vocabulary = list(vocabulary)

In [81]:
# Lenght of Vocab
print(f'The Lenght of the Vocabulary  is : {len(vocabulary)}')

The Lenght of the Vocabulary  is : 14967


In [82]:
train_df.iloc[2, index_lemmatized]

'all residents ask to shelter in place be be notify by officer no other evacuation or shelter in place order be expect'

In [83]:
vocabulary[0:10]

['time',
 'infest',
 'curse',
 'zepp',
 'hieroglyphics',
 'deccgovuk',
 'australian',
 'firewise',
 'pardon',
 'breed']

In [84]:
#for idx, element in enumerate(train_df['text_tokenized']):
#    if 'choppergatebronwynbishopauspol' in element:
#        print(idx, element)
#        break
#    else:
#        continue

In [85]:
#for idx, element in enumerate(train_df['text']):
#    if 'muhammad' in element:
#        print(idx, element)
#        break
#    else:
#        continue

---

## Model

In [86]:
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
count_vectorizer = feature_extraction.text.CountVectorizer()

train_vectors = count_vectorizer.fit_transform(train_df["text_lemmatized"])

## note that we're NOT using .fit_transform() here. Using just .transform() makes sure
# that the tokens in the train vectors are the only ones mapped to the test vectors - 
# i.e. that the train and test vectors use the same set of tokens.
test_vectors = count_vectorizer.transform(test_df["text_lemmatized"])

In [87]:
train_vectors.todense().shape

(7613, 15915)

In [88]:
## Our vectors are really big, so we want to push our model's weights
## toward 0 without completely discounting different words - ridge regression 
## is a good way to do this.
clf = linear_model.RidgeClassifier()

In [89]:
scores = model_selection.cross_val_score(clf, train_vectors, train_df["target"], cv=5, scoring="f1")
scores.mean()

0.5828892616588525

---