In [40]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


In [41]:
train_data = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
train_data.shape

(7613, 5)

In [42]:
test_data = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")
test_data.shape

(3263, 4)

In [43]:
print('Training Data')
print(train_data.head(5))
print('-----------------------------------------------------------------------------------')
print('Test Data')
print(test_data.head(5))

Training Data
   id keyword location                                               text  \
0   1     NaN      NaN  Our Deeds are the Reason of this #earthquake M...   
1   4     NaN      NaN             Forest fire near La Ronge Sask. Canada   
2   5     NaN      NaN  All residents asked to 'shelter in place' are ...   
3   6     NaN      NaN  13,000 people receive #wildfires evacuation or...   
4   7     NaN      NaN  Just got sent this photo from Ruby #Alaska as ...   

   target  
0       1  
1       1  
2       1  
3       1  
4       1  
-----------------------------------------------------------------------------------
Test Data
   id keyword location                                               text
0   0     NaN      NaN                 Just happened a terrible car crash
1   2     NaN      NaN  Heard about #earthquake is different cities, s...
2   3     NaN      NaN  there is a forest fire at spot pond, geese are...
3   9     NaN      NaN           Apocalypse lighting. #Spokan

# Text Preprocessing - Pipeline
* CountVectorizer -Text to token integer counts
* TF-IDF transformer - integer to TFIDF scores
* Classifier - train TFIDF  vectors with Logistic Regression


In [44]:
import string
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import  TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [45]:
def process_text(text):
    """
    Removes punctuations(if any), stopwords and returns a list words
    """
    rm_pun = [char for char in text if char not in string.punctuation]
    rm_pun = ''.join(rm_pun)
    
    return [word for word in rm_pun.split() if word.lower() not in stopwords.words('english')]

# Logistic Regression Classifier

In [None]:
pipeline = Pipeline([
    ('cv', CountVectorizer(analyzer=process_text)),  
    ('tfidf', TfidfTransformer()),  
    ('classifier', LogisticRegression()),
])

# Support Vector Machine Classifier

In [None]:
pipeline = Pipeline([
    ('cv', CountVectorizer(analyzer=process_text)),  
    ('tfidf', TfidfTransformer()),  
    ('classifier', SVC()),
])

# RandomForest Classifier

In [46]:
pipeline = Pipeline([
    ('cv', CountVectorizer(analyzer=process_text)),  
    ('tfidf', TfidfTransformer()),  
    ('classifier', RandomForestClassifier(n_estimators=600)),
])

In [47]:
pipeline.fit(train_data['text'],train_data['target'])

Pipeline(steps=[('cv',
                 CountVectorizer(analyzer=<function process_text at 0x7f5557c43710>)),
                ('tfidf', TfidfTransformer()),
                ('classifier', RandomForestClassifier(n_estimators=600))])

# Test Data Prediction

In [48]:
sample_submission = pd.read_csv("/kaggle/input/nlp-getting-started/sample_submission.csv")

In [49]:
sample_submission['target'] = pipeline.predict(test_data['text'])

# File Submission

In [50]:
sample_submission.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,0
4,11,1


In [51]:
sample_submission.to_csv("submission.csv", index=False)