## Get data

In [None]:
# Download the dataset, the spaCy model and the helper module -- only on Google Colab
try:
    import google.colab  # noqa: F401 -- imported purely to detect the Colab runtime
except ImportError:
    # Only a failed import means "not on Colab"; genuine errors in the setup
    # steps below must not be misreported as "not running on Colab".
    print("Not running on Google Colab")
else:
    print("Running on Google Colab")
    import subprocess

    def run_command(command):
        """Run a shell command and return its decoded standard output.

        Args:
            command: Command line, passed to the shell as a single string.

        Returns:
            The command's stdout decoded as UTF-8 (undecodable bytes replaced).

        A non-zero exit status is reported together with the captured stderr
        instead of being silently ignored. No exception is raised, so the
        remaining setup steps still run (best-effort, as before).
        """
        completed = subprocess.run(
            command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
        if completed.returncode != 0:
            print(f"Command failed with exit code {completed.returncode}: {command}")
            print(completed.stderr.decode("utf-8", errors="replace"))
        return completed.stdout.decode("utf-8", errors="replace")

    # The archive is downloaded into the working directory, which is where the
    # unzip step expects it. (wget's `-d` flag means "debug", not "directory",
    # so the former `-d sample_data` only added a bogus second URL.)
    # `-O` / `-o` overwrite existing files so re-running the cell is idempotent.
    run_command("wget -O nlp_getting_started.zip https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip")
    run_command("unzip -o nlp_getting_started.zip -d sample_data/nlp_getting_started")
    run_command("python -m spacy download en_core_web_sm")  # download english model for spacy
    run_command("wget -O helper_functions.py https://raw.githubusercontent.com/anilkumarKanasani/tensorflow-deep-learning/main/Notebooks/helper_functions.py")
    print("Data Import completed")

## Explore Data

In [None]:
import numpy as np
import pandas as pd
df_train = pd.read_csv("./sample_data/nlp_getting_started/train.csv")
df_test = pd.read_csv("./sample_data/nlp_getting_started/test.csv")

# Replace missing keyword/location values with explicit placeholder strings in
# BOTH splits, so later string processing never receives NaN and train/test
# use the same placeholder tokens.
for split_df in (df_train, df_test):
    split_df['keyword'] = split_df['keyword'].fillna('no_keyword')
    split_df['location'] = split_df['location'].fillna('no_location')

df_train.head()

In [None]:
# tidy the keyword column in both splits:
# every literal %20 sequence becomes a single space
for keyword_frame in (df_train, df_test):
    keyword_frame['keyword'] = keyword_frame['keyword'].str.replace('%20', ' ')
df_train.head()

In [95]:
# cleaning location column: derive a best-guess place name ("country") from the
# free-text location with spaCy NER, then drop the raw location column
import spacy
# Load the pre-trained spaCy model for English
nlp = spacy.load('en_core_web_sm')


def extract_country(location):
    """Return the last GPE entity spaCy finds in ``location``.

    Args:
        location: Free-text location of a tweet. Any non-string value
            (e.g. NaN for a missing location) is treated as 'no_location'.

    Returns:
        The text of the last entity labelled 'GPE' (spaCy's geopolitical
        entity label, which also covers cities and states), or 'no_country'
        when no such entity is found.
    """
    if not isinstance(location, str):
        location = 'no_location'
    doc = nlp(location)
    # Walk the entities back to front so the right-most GPE wins.
    for ent in reversed(doc.ents):
        if ent.label_ == 'GPE':
            return ent.text
    return 'no_country'


# Assign whole columns at once instead of writing cell-by-cell through chained
# indexing (df['country'][i] = ...), which is unreliable in pandas.
df_train['country'] = df_train['location'].apply(extract_country)
df_test['country'] = df_test['location'].apply(extract_country)

# drop the location column from both train and test data frames
df_train = df_train.drop('location', axis=1)
df_test = df_test.drop('location', axis=1)



In [97]:
# persist both cleaned splits as CSV files (row index omitted)
for cleaned_frame, cleaned_csv in (
    (df_train, './sample_data/nlp_getting_started/train_cleaned.csv'),
    (df_test, './sample_data/nlp_getting_started/test_cleaned.csv'),
):
    cleaned_frame.to_csv(cleaned_csv, index=False)

In [123]:
# read the cleaned splits back from disk (train first, then test)
df_train, df_test = (
    pd.read_csv(cleaned_csv)
    for cleaned_csv in (
        './sample_data/nlp_getting_started/train_cleaned.csv',
        './sample_data/nlp_getting_started/test_cleaned.csv',
    )
)


### Exploring KeyWords

In [124]:
# number of rows in the train and test splits
tuple(len(split_frame) for split_frame in (df_train, df_test))

(7613, 3263)

In [None]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

# show complete frames: the summary tables below are meant to be read in full
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# list of keywords
keywords = list(df_train.keyword.unique())

# Number of positive (target == 1) and negative (target == 0) tweets per
# keyword, computed in a single grouped pass. (DataFrame.append was removed in
# pandas 2.0, and growing a frame row by row re-scans df_train per keyword.)
df_keywords = (
    df_train.assign(
        positive=df_train['target'].eq(1),
        negative=df_train['target'].eq(0),
    )
    .groupby('keyword', sort=False)[['positive', 'negative']]
    .sum()
    .astype(int)
    .reset_index()
)

# row total per keyword, rarest keywords first
df_keywords['Total'] = df_keywords['positive'] + df_keywords['negative']
df_keywords = df_keywords.sort_values(by='Total', ascending=True)

# grand-total row, appended last and labelled 'Total'; only the numeric
# columns are summed, so the keyword strings are never concatenated
keyword_totals = df_keywords[['positive', 'negative', 'Total']].sum()
df_keywords = pd.concat(
    [df_keywords, pd.DataFrame([{'keyword': 'Total', **keyword_totals.to_dict()}])],
    ignore_index=True,
)

df_keywords

### Exploring location

In [126]:
# relabel every country that occurs in exactly one tweet (counted separately
# within each split) as no_country
for split_df in (df_train, df_test):
    # count once per frame instead of recomputing value_counts() for every row
    country_counts = split_df['country'].value_counts()
    is_singleton = split_df['country'].map(country_counts).eq(1)
    split_df['country'] = split_df['country'].mask(is_singleton, 'no_country')

In [None]:
# positive / negative tweet counts per country, with row and column totals
countries = list(df_train.country.unique())

# Count target == 1 (positive) and target == 0 (negative) tweets per country in
# a single grouped pass. (DataFrame.append was removed in pandas 2.0, and
# growing a frame row by row re-scans df_train per country.)
df_countries = (
    df_train.assign(
        positive=df_train['target'].eq(1),
        negative=df_train['target'].eq(0),
    )
    .groupby('country', sort=False)[['positive', 'negative']]
    .sum()
    .astype(int)
    .reset_index()
)

# row total per country, rarest countries first
df_countries['Total'] = df_countries['positive'] + df_countries['negative']
df_countries = df_countries.sort_values(by='Total', ascending=True)

# grand-total row, appended last and labelled 'Total'; only the numeric
# columns are summed, so the country strings are never concatenated
country_totals = df_countries[['positive', 'negative', 'Total']].sum()
df_countries = pd.concat(
    [df_countries, pd.DataFrame([{'country': 'Total', **country_totals.to_dict()}])],
    ignore_index=True,
)

df_countries


### Exploring the text

In [None]:
# extract the url from the text, clean url from the text
import re
def extract_url(text):
    """Return every URL-like substring of ``text`` as a list, in order.

    A match starts with ``http://``, ``https://`` or ``www.`` and runs up to
    the next whitespace character.
    """
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.findall(text)

# clean the url from the text
def clean_url(text):
    """Return ``text`` with every URL removed.

    A URL is anything starting with ``http://``, ``https://`` or ``www.`` up to
    the next whitespace (the same pattern ``extract_url`` matches).

    Each match is removed by position with a single ``re.sub``. Removing the
    matches one by one with ``str.replace`` corrupts the text when one URL is
    a substring of another: 'http://t.co/a' followed by 'http://t.co/ab' would
    leave a stray 'b' behind.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)

# strip URLs out of the tweet text in both splits
for tweet_frame in (df_train, df_test):
    tweet_frame['text'] = tweet_frame['text'].apply(clean_url)

df_train

In [129]:
# do the same for hashtags, remove # from the hashtag column items
def extract_hashtag(text):
    """Return every hashtag in ``text`` as a list, in order.

    A hashtag is a '#' followed by one or more non-whitespace characters; the
    leading '#' is kept in each returned item.
    """
    hashtag_pattern = re.compile(r'#\S+')
    return hashtag_pattern.findall(text)

# new column holding the list of hashtags found in each tweet
for tagged_frame in (df_train, df_test):
    tagged_frame['hashtag'] = tagged_frame['text'].apply(extract_hashtag)

# clean the hashtag from the text
def clean_hashtag(text):
    """Return ``text`` with every hashtag removed.

    A hashtag is a '#' followed by one or more non-whitespace characters (the
    same pattern ``extract_hashtag`` matches).

    Each match is removed by position with a single ``re.sub``. Removing the
    hashtags one by one with ``str.replace`` corrupts the text when one hashtag
    is a prefix of another: '#fire' followed by '#firefighters' would leave
    'fighters' behind.
    """
    return re.sub(r'#\S+', '', text)

# strip the hashtags out of the tweet text in both splits
for tweet_frame in (df_train, df_test):
    tweet_frame['text'] = tweet_frame['text'].apply(clean_hashtag)

# remove the # from the hashtag column items
def remove_hashtag(list_of_text):
    """Return a new list with every '#' character removed from each string."""
    return [tag.replace('#', '') for tag in list_of_text]

# drop the '#' characters from every item of the hashtag lists in both splits
for tagged_frame in (df_train, df_test):
    tagged_frame['hashtag'] = tagged_frame['hashtag'].apply(remove_hashtag)

df_train

Unnamed: 0,id,keyword,text,target,country,hashtag
0,1,no_keyword,Our Deeds are the Reason of this May ALLAH Fo...,1,no_country,[earthquake]
1,4,no_keyword,Forest fire near La Ronge Sask. Canada,1,no_country,[]
2,5,no_keyword,All residents asked to 'shelter in place' are ...,1,no_country,[]
3,6,no_keyword,"13,000 people receive evacuation orders in Ca...",1,no_country,[wildfires]
4,7,no_keyword,Just got sent this photo from Ruby as smoke f...,1,no_country,"[Alaska, wildfires]"
5,8,no_keyword,Update => California Hwy. 20 closed in both d...,1,no_country,"[RockyFire, CAfire, wildfires]"
6,10,no_keyword,Heavy rain causes flash flooding of streets ...,1,no_country,"[flood, disaster]"
7,13,no_keyword,I'm on top of the hill and I can see a fire in...,1,no_country,[]
8,14,no_keyword,There's an emergency evacuation happening now ...,1,no_country,[]
9,15,no_keyword,I'm afraid that the tornado is coming to our a...,1,no_country,[]
