In [30]:
import pandas as pd
from sklearn import feature_extraction, linear_model,  model_selection, preprocessing
# None removes the cap on how many columns pandas shows when displaying a DataFrame.
pd.set_option('display.max_columns', None)

In [2]:
# Load the train and test CSVs from the local data/ directory.
# Paths are relative to the notebook's working directory.
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')

### Utils

In [3]:
# Exact tweet text -> the single label every row with that text should carry.
# NOTE(review): presumably these are the duplicated tweets that appear in the
# training data with conflicting targets; confirm against the ambiguity check.
AMBIGUOUS_TWEET_LABELS = {
    'like for the music video I want some real action shit like burning buildings and police chases not some weak ben winston shit': 0,
    'Hellfire is surrounded by desires so be careful and donÛªt let your desires control you! #Afterlife': 0,
    'To fight bioterrorism sir.': 0,
    '.POTUS #StrategicPatience is a strategy for #Genocide; refugees; IDP Internally displaced people; horror; etc. https://t.co/rqWuoy1fm4': 1,
    'CLEARED:incident with injury:I-495  inner loop Exit 31 - MD 97/Georgia Ave Silver Spring': 1,
    '#foodscare #offers2go #NestleIndia slips into loss after #Magginoodle #ban unsafe and hazardous for #humanconsumption': 0,
    'In #islam saving a person is equal in reward to saving all humans! Islam is the opposite of terrorism!': 0,
    'Who is bringing the tornadoes and floods. Who is bringing the climate change. God is after America He is plaguing her\n \n#FARRAKHAN #QUOTE': 1,
    'RT NotExplained: The only known image of infamous hijacker D.B. Cooper. http://t.co/JlzK2HdeTG': 1,
    "Mmmmmm I'm burning.... I'm burning buildings I'm building.... Oooooohhhh oooh ooh...": 0,
    "wowo--=== 12000 Nigerian refugees repatriated from Cameroon": 0,
    "He came to a land which was engulfed in tribal war and turned it into a land of peace i.e. Madinah. #ProphetMuhammad #islam": 0,
    "Hellfire! We donÛªt even want to think about it or mention it so letÛªs not do anything that leads to it #islam!": 0,
    "The Prophet (peace be upon him) said 'Save yourself from Hellfire even if it is by giving half a date in charity.'": 0,
    "Caution: breathing may be hazardous to your health.": 1,
    "I Pledge Allegiance To The P.O.P.E. And The Burning Buildings of Epic City. ??????": 0,
    "#Allah describes piling up #wealth thinking it would last #forever as the description of the people of #Hellfire in Surah Humaza. #Reflect": 0,
    "that horrible sinking feeling when youÛªve been at home on your phone for a while and you realise its been on 3G this whole time": 0,
}


def remove_ambiguous_labels(df):
    """Return a copy of ``df`` with a fixed label applied to each listed tweet.

    Every row whose ``text`` exactly equals a key of ``AMBIGUOUS_TWEET_LABELS``
    gets its ``target`` overwritten with the mapped value. All other rows,
    the column order and the index are left as they are.

    The input frame is not modified: the relabelling is done on a copy, so the
    caller's ``df`` keeps its original targets and gains no helper columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least a ``text`` and a ``target`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns as ``df`` and corrected targets.

    Raises
    ------
    KeyError
        If ``df`` lacks the ``text`` or ``target`` column.
    """
    missing = [column for column in ('target', 'text') if column not in df.columns]
    if missing:
        # Fail loudly instead of letting .loc silently create a new column.
        raise KeyError(f"remove_ambiguous_labels requires column(s): {missing}")

    relabeled = df.copy()
    for tweet_text, label in AMBIGUOUS_TWEET_LABELS.items():
        relabeled.loc[relabeled['text'] == tweet_text, 'target'] = label
    return relabeled


### Understanding the data

In [4]:
# Class balance of the training labels.
print("Number of Disaster tweets")
print((df_train['target'] == 1).sum())

print("Number of Non-Disaster tweets")
print((df_train['target'] == 0).sum())

# A valid label is exactly 0 or 1; anything else (negative, > 1, or missing)
# counts as mislabelled. Checking only `> 1` would let those cases slip through.
print("Number of Mislabelled tweets")
print((~df_train['target'].isin([0, 1])).sum())

# Tweets whose identical text occurs with more than one distinct target value.
# nunique() is computed for every column so the id/keyword/location spread of
# each duplicated text is visible alongside the target count.
print("# of ambiguously labelled tweets")
df_mislabeled = df_train.groupby(['text']).nunique().sort_values(by='target', ascending=False)
df_mislabeled = df_mislabeled[df_mislabeled['target'] > 1]
print(df_mislabeled.head(5))



Number of Disaster tweets
3271
Number of Non-Disaster tweets
4342
Number of Mislabelled tweets
0
# of ambiguously labelled tweets
                                                    id  keyword  location  \
text                                                                        
like for the music video I want some real actio...   2        2         1   
Hellfire! We donÛªt even want to think about i...   2        1         1   
The Prophet (peace be upon him) said 'Save your...   6        1         1   
In #islam saving a person is equal in reward to...   2        1         2   
To fight bioterrorism sir.                           4        1         0   

                                                    target  
text                                                        
like for the music video I want some real actio...       2  
Hellfire! We donÛªt even want to think about i...       2  
The Prophet (peace be upon him) said 'Save your...       2  
In #islam saving a person

### Processing data

In [6]:
# Overwrite the targets of the tweets hard-coded in remove_ambiguous_labels
# and rebind df_train to the returned frame.
df_train = remove_ambiguous_labels(df_train)

In [8]:
# Column dtypes and non-null counts of the training frame after relabelling.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [10]:
# Replace missing keyword/location entries with empty strings, then re-check
# the non-null counts. Only df_train is filled here; df_test is left as loaded.
values = {"keyword":"", "location":""}
df_train = df_train.fillna(value=values)
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7613 non-null   object
 2   location  7613 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [16]:
# Token-count features: the vocabulary is learned from the training tweets
# only (fit_transform), and the test tweets are encoded with that same
# vocabulary (transform).
count_vectorizer = feature_extraction.text.CountVectorizer()
train_vectors = count_vectorizer.fit_transform(df_train["text"])
test_vectors = count_vectorizer.transform(df_test["text"])

In [17]:
# (number of training tweets, number of vocabulary terms)
train_vectors.shape

(7613, 21637)

In [20]:
# First five training rows that have a non-empty keyword.
df_train.loc[df_train['keyword']!=""].head(5)

Unnamed: 0,id,keyword,location,text,target
31,48,ablaze,Birmingham,@bbcmtd Wholesale Markets ablaze http://t.co/l...,1
32,49,ablaze,Est. September 2012 - Bristol,We always try to bring the heavy. #metal #RT h...,0
33,50,ablaze,AFRICA,#AFRICANBAZE: Breaking news:Nigeria flag set a...,1
34,52,ablaze,"Philadelphia, PA",Crying out for more! Set me ablaze,0
35,53,ablaze,"London, UK",On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0


In [28]:
# Column dtypes and non-null counts of the test frame.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB


In [22]:
# Ridge classifier constructed with scikit-learn's default hyperparameters.
clf = linear_model.RidgeClassifier()

In [33]:
# 3-fold cross-validated F1 score of clf on the training vectors; the bare
# `scores` expression displays the per-fold values.
scores = model_selection.cross_val_score(clf, train_vectors, df_train["target"], cv=3, scoring="f1")
scores

array([0.5941873 , 0.56710591, 0.64373464])

In [34]:
# Load the sample submission file; its target column is filled with
# predictions in a later cell.
sample_submission = pd.read_csv("data/sample_submission.csv")

In [35]:
# cross_val_score only fits internal clones of clf, so clf itself is still
# unfitted at this point. Fit it on the full training set before predicting;
# otherwise predict() raises NotFittedError on a fresh kernel.
clf.fit(train_vectors, df_train["target"])
sample_submission["target"] = clf.predict(test_vectors)

In [36]:
# Preview the first rows of the submission frame.
sample_submission.head()

Unnamed: 0,id,target
0,0,0
1,2,1
2,3,1
3,9,0
4,11,1


In [37]:
# Write the submission to disk; index=False keeps the DataFrame index out of the CSV.
sample_submission.to_csv("data/submission.csv", index=False)