#### ***Session 1 : Import Libraries and Load Data***

In [81]:
import pandas as pd 
pd.set_option('display.max_rows',500)
pd.set_option('display.max_columns',500)
import numpy as np                    
from xgboost import XGBRegressor 
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,accuracy_score,confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC

In [62]:
# Read the training tweets and the test tweets from the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

#### ***Session 2 : Basic Data Exploration***

In [63]:
# Peek at the first five rows of the raw training frame.
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [64]:
# Missing values per column.
train.isnull().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [65]:
# (rows, columns) of the training frame.
train.shape

(7613, 5)

In [66]:
# Number of rows that are exact duplicates of an earlier row across every column
# (the frame still contains `id` at this point).
train.duplicated().sum()

0

#### ***Session 3 : Data Cleaning***

*i) Transform the text column*

**Lowercase the text and remove bracketed text, links, HTML tags, punctuation marks and line breaks**

In [67]:
import re 
import string

def wordopt(text):
    """Normalise a raw text string before it is vectorised.

    The following steps are applied in order:

    1. lowercase the text;
    2. remove square-bracketed spans (``[...]``);
    3. remove URLs starting with ``http://``, ``https://`` or ``www.``;
    4. remove angle-bracketed spans such as HTML tags (``<...>``);
    5. remove every character listed in ``string.punctuation``;
    6. replace each newline with a single space.

    Parameters
    ----------
    text : str
        The raw text to clean.

    Returns
    -------
    str
        The cleaned text. Runs of spaces left behind by removed spans are
        not collapsed.
    """
    text = text.lower()
    # Patterns are raw strings: '\[', '\S' and '\.' inside an ordinary string
    # literal are invalid escape sequences and trigger a warning on modern Python.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # A newline becomes a space (not an empty string) so that the last word of
    # one line is not glued to the first word of the next line.
    text = re.sub(r'\n', ' ', text)
    return text

# Run the same text normaliser over the `text` column of both frames.
for frame in (train, test):
    frame['text'] = frame['text'].apply(wordopt)

In [68]:
# First five rows after text cleaning.
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,our deeds are the reason of this earthquake ma...,1
1,4,,,forest fire near la ronge sask canada,1
2,5,,,all residents asked to shelter in place are be...,1
3,6,,,13000 people receive wildfires evacuation orde...,1
4,7,,,just got sent this photo from ruby alaska as s...,1


*ii) Select Required Features*

In [69]:
# Discard the identifier and metadata columns from both frames, in place.
for frame in (train, test):
    frame.drop(columns=['id', 'keyword', 'location'], inplace=True)

In [70]:
# First ten rows of the reduced training frame.
train.head(10)

Unnamed: 0,text,target
0,our deeds are the reason of this earthquake ma...,1
1,forest fire near la ronge sask canada,1
2,all residents asked to shelter in place are be...,1
3,13000 people receive wildfires evacuation orde...,1
4,just got sent this photo from ruby alaska as s...,1
5,rockyfire update california hwy 20 closed in ...,1
6,flood disaster heavy rain causes flash floodin...,1
7,im on top of the hill and i can see a fire in ...,1
8,theres an emergency evacuation happening now i...,1
9,im afraid that the tornado is coming to our area,1


In [71]:
# Report how many duplicated rows exist, remove them in place
# (renumbering the index), then report again to confirm none remain.
n_duplicates_before = train.duplicated().sum()
print(f'Initial Number of Duplicates : {n_duplicates_before}')

train.drop_duplicates(ignore_index=True, inplace=True)

n_duplicates_after = train.duplicated().sum()
print(f'Final Number of Duplicates : {n_duplicates_after}')

Initial Number of Duplicates : 591
Final Number of Duplicates : 0


#### ***Session 4 : Model Building and Training***

In [72]:
# The cleaned text is the only input feature; `target` is the label.
X, y = train['text'], train['target']
X_test = test['text']

In [73]:
# Fit the TF-IDF vectorizer (configured with stop_words='english') on the training
# text only, then apply that same fitted vectorizer to the test text.
# NOTE: X and X_test are rebound here from raw text to the vectorizer's output, so
# re-running this cell on its own would feed already-vectorized data back in.
vectorizer=TfidfVectorizer(stop_words='english')
X=vectorizer.fit_transform(X)
X_test=vectorizer.transform(X_test)

In [74]:
# Train a support-vector classifier, constructed with no arguments (library defaults),
# on the full vectorized training set; no validation split is held out in this notebook.
model=SVC()
model.fit(X,y)

#### ***Session 5 : Prediction and Saving to CSV for Submission***

In [75]:
# Predict a label for each vectorized test tweet.
prediction=model.predict(X_test)

In [77]:
# Store the predictions next to the cleaned test text so they can be inspected together.
test['predicted']=prediction

In [84]:
# Start from the submission template, discard its placeholder `target` column and
# append the model's predictions under the same name.
# NOTE(review): predictions are assigned by position, so this assumes the template
# rows are in the same order as test.csv -- confirm.
sample = (
    pd.read_csv('sample_submission.csv')
    .drop(columns=['target'])
    .assign(target=prediction)
)

In [88]:
# Inspect the last fifty test rows together with their predicted labels.
test.tail(50)

Unnamed: 0,text,predicted
3213,and the salt in my wounds isnt burnin any more...,0
3214,when and how does a character recover fromåêwo...,0
3215,what they dentists dont tell u is how much ur ...,0
3216,our wounds can so easily turn us into people w...,0
3217,im sitting in the parking lot waiting to go in...,0
3218,last week we had a blast hosting dinner amp a ...,0
3219,now that iûªve figured out how to get my musi...,0
3220,im an emotional wreck watching emmerdale fml,0
3221,interesting in watching a train wreck while ta...,1
3222,dukeskywalker facialabuse you should do a comp...,0


In [87]:
# Export is currently disabled: uncomment the line below to write the submission
# frame to 'sample_submission1.csv' without the index column.
#sample.to_csv('sample_submission1.csv',index=False)