# NLP Getting Started

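Kaggle competition: [nlp-getting-started](https://www.kaggle.com/competitions/nlp-getting-started)

This notebook builds a baseline submission: tweets are turned into bag-of-words count vectors and a ridge classifier predicts the binary `target` column.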

## Setup

In [2]:
import os
import glob
from pathlib import Path
import random
from PIL import Image

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

from sklearn import feature_extraction, linear_model, model_selection, preprocessing

In [3]:
# --- Notebook-wide configuration -------------------------------------------

# Competition slug; used for the Kaggle input directory and the download.
COMP_NAME = 'nlp-getting-started'
if COMP_NAME is None:
    raise NameError('COMP_NAME has not been initialized')

# Seed shared by every random number generator used below.
RANDOM_SEED = 42

# Raw value of the environment variable; the empty string (falsy) when the
# variable is not set.
IS_KAGGLE = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')

# Competition input directory when IS_KAGGLE is truthy, otherwise a local
# ./data folder.
if IS_KAGGLE:
    DATA_PATH = Path('../input/' + COMP_NAME)
else:
    DATA_PATH = Path('./data')

In [4]:
# Seed the stdlib and NumPy global generators with the same value so that
# re-running the notebook reproduces the same random draws.
for seed_rng in (random.seed, np.random.seed):
    seed_rng(RANDOM_SEED)

## Load Data

In [5]:
# Download and unpack the competition archive only when the data directory
# does not exist yet; otherwise the files already on disk are reused.
if not DATA_PATH.exists():
    # Imported lazily so `kaggle` is only required when a download is needed.
    import zipfile
    import kaggle

    # The extraction step below expects the archive as '<COMP_NAME>.zip' in
    # the current working directory.
    kaggle.api.competition_download_cli(COMP_NAME)

    # The context manager closes the archive handle once extraction is done
    # (the previous bare ZipFile(...) call left it open).
    with zipfile.ZipFile(f'{COMP_NAME}.zip') as archive:
        archive.extractall(DATA_PATH)

In [5]:
# Training data; preview the first rows.
train_df = pd.read_csv(DATA_PATH.joinpath('train.csv'))
train_df.head(5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [6]:
# Test data; preview the first rows.
test_df = pd.read_csv(DATA_PATH.joinpath('test.csv'))
test_df.head(5)

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


## Data Exploration

In [7]:
# Second tweet (position 1) among the rows whose target is 0.
train_df.loc[train_df['target'] == 0, 'text'].values[1]

'I love fruits'

In [8]:
# Second tweet (position 1) among the rows whose target is 1.
train_df.loc[train_df['target'] == 1, 'text'].values[1]

'Forest fire near La Ronge Sask. Canada'

## Prepare Training/Validation Data

In [9]:
# Bag-of-words vectorizer; it is re-fitted on the full training set later.
count_vectorizer = feature_extraction.text.CountVectorizer()

# Demo only: fit on the first five training tweets to see what a vector
# looks like.
example_train_vectors = count_vectorizer.fit_transform(train_df['text'].iloc[:5])

# Dense view of the first tweet's vector: its shape, then its contents.
first_row_dense = example_train_vectors[0].todense()
print(first_row_dense.shape, first_row_dense, sep='\n')

(1, 54)
[[0 0 0 1 1 1 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0
  0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 0]]


In [10]:
# Fit the vocabulary on the training tweets only, then reuse that same
# vocabulary for the test tweets so both matrices share one column layout.
train_vectors = count_vectorizer.fit_transform(raw_documents=train_df['text'])
test_vectors = count_vectorizer.transform(raw_documents=test_df['text'])

## Train Model

In [12]:
# Ridge classifier with default settings, scored by 3-fold cross-validated F1
# on the training vectors.
model = linear_model.RidgeClassifier()
scores = model_selection.cross_val_score(
    estimator=model,
    X=train_vectors,
    y=train_df['target'],
    scoring='f1',
    cv=3,
)

print(scores)

[0.59453669 0.5642787  0.64082434]


In [13]:
# Fit the final model on the full training set.
model.fit(X=train_vectors, y=train_df['target'])

## Submission

In [15]:
# Start from the sample submission so the output file has its layout.
submission = pd.read_csv(DATA_PATH/'sample_submission.csv')

# Predictions are assigned by row position, so the sample submission must
# list the same ids in the same order as test.csv; fail loudly otherwise
# instead of silently writing misaligned predictions.
# NOTE(review): assumes sample_submission.csv has an 'id' column — confirm.
assert submission['id'].tolist() == test_df['id'].tolist(), \
    'sample_submission.csv rows are not in the same order as test.csv'

submission['target'] = model.predict(test_vectors)

# index=False keeps the DataFrame index out of the written file.
submission.to_csv('submission.csv', index=False)