<a href="https://colab.research.google.com/github/axiom9/WebScraping_and_NLP/blob/main/NLP_Tasks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [107]:
# https://towardsdatascience.com/deep-learning-pipeline-for-natural-language-processing-nlp-c6f4074897bb

# https://tfhub.dev/google/nnlm-en-dim50/2

# https://medium.com/analytics-vidhya/introduction-to-nlp-with-disaster-tweets-3b672a75748c

# Pre-reqs (Loading data)

In [108]:
import pandas as pd

In [109]:
%tensorflow_version 2.x
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [110]:
def load_data():
  ! mkdir ~/.kaggle
  ! cp kaggle.json ~/.kaggle/
  ! chmod 600 ~/.kaggle/kaggle.json
  ! kaggle competitions download -c nlp-getting-started

In [111]:
ls

[0m[01;34msample_data[0m/  sample_submission.csv  test.csv  train.csv


In [112]:
load_data()

mkdir: cannot create directory ‘/root/.kaggle’: File exists
cp: cannot stat 'kaggle.json': No such file or directory
test.csv: Skipping, found more recently modified local copy (use --force to force download)
sample_submission.csv: Skipping, found more recently modified local copy (use --force to force download)
train.csv: Skipping, found more recently modified local copy (use --force to force download)


In [113]:
df = pd.read_csv('/content/sample_data/train.csv')

# Pre-processing Data

### Imports

In [252]:
import nltk
# nltk.download('words')
from nltk.corpus import words, stopwords
from nltk.stem import WordNetLemmatizer

from langdetect import detect

import re

import string
from string import digits

In [115]:
df

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [116]:
df.isna().sum() / len(df)

id          0.000000
keyword     0.008013
location    0.332720
text        0.000000
target      0.000000
dtype: float64

## Now we can begin pre-processing the sentences:

* Drop duplicates if any
* Drop any columns that might not be needed
* Drop missing values if any
* Remove punctuations
* Convert the words into lowercase
* Remove URLs, the word “twitter” and other acronyms
* Extract only tweets that are in English
* Tokenize (break the tweets into single words)
* Remove stopwords


In [117]:
df

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [118]:
# df2copy = df.text
# df2copy.apply(lambda sentences: [sentence for x in sentences if x in words.words()])

In [260]:
words = set(nltk.corpus.words.words())
try:
  stop_words = set(stopwords.words('english'))
except LookupError:
  nltk.download('stopwords')
finally:
  stop_words = set(stopwords.words('english'))

nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()



[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [275]:
def clean_data_1(df):
    """Function that drops unwanted columns, duplicates, rows w/ missing tweets and removes punctuations"""

    # Dictionary to expand contractions (will be used later)
    contractions_dict = {"ain't": "are not","'s":" is","aren't": "are not", 
                     "shouldn't":"should not", "wouldn't":"would not", "couldn't":"could not",
                     "they're": "they are", "he'd":"he would", "I'm": "I am", "he's":"he is",
                     "isn't": "is not"}

    #Function to remove digits in text                    
    def remove_nums(text):
        """Removes any text (word) containing digits in it and properly concatenates
        the remaining sentence"""
        return " ".join(re.sub(r'\w*\d\w*', '', text).strip().split())

    #Function to remove URLs
    def remove_URL(text):
        """Remove URLs from a text string"""
        return re.sub(r"http\S+", "", text)

    # Function to expand contractions based on the contractions_dict
    def expand_contractions(text,contractions_dict=contractions_dict):
        def replace(match):
            return contractions_dict[match.group(0)]
        return contractions_re.sub(replace, text)

    # Function to remove ascii characters
    def remove_ascii(text):
        """Removes any ascii characters"""
        return text.encode("ascii", "ignore").decode()

    # Function to remove stop words
    def remove_stopwords(text):
        """Removes stopwords from a corpus"""
        return " ".join([word for word in str(text).split() if word not in stop_words])

    # Function to lemmatize words
    def lemmatize_words(text):
        """ Uses NLTK wordnet lemmatizer to lemmatize words """
        return " ".join([lemmatizer.lemmatize(word) for word in text.split()])


    # def detect_languages(df):
    #     """Function that adds a new column in the dataframe passed that is the language the text is in"""
    #     # Check if the language is en, if it is do nothing, if it isn't then drop it
    #     for i in range(len(df.text)):
    #         if detect(df.text[i]) == 'en':
    #             df.drop(i, inplace=True)

    #Detect and drop rows that aren't in English - Temporarily excluding
    # detect_languages(df)

    # Make a copy of the df
    df = df.copy()

    # Drop columns that aren't needed
    cols_to_drop = ['id', 'keyword', 'location']
    df = df.drop(cols_to_drop, axis=1)
    # Note: We only need 'text' and 'target' columns for this deep learning task. Other columns can be effectively discarded

    # Regular expression for finding contractions
    contractions_re=re.compile('(%s)' % '|'.join(contractions_dict.keys()))

    # Expanding Contractions in the reviews
    df.text=df.text.apply(lambda x:expand_contractions(x))

    # Remove URLs
    df.text = df.text.apply(lambda x: remove_URL(x))

    # Remove ascii characters / text
    df.text = df.text.apply(lambda x:remove_ascii(x))

    # Removing punctutations
    df.text = df.text.apply(lambda x: x.translate(str.maketrans(' ',' ', string.punctuation)))

    # Removing numbers
    # df.text = df.text.apply(lambda x: x.translate(str.maketrans(' ',' ', digits)))
    df.text = df.text.apply(lambda x: remove_nums(x))

    # Lowercase
    df.text = df.text.apply(lambda x: x.lower())

    # Drop duplicates
    df.drop_duplicates(inplace=True) 

    # Drop any rows with missing tweets
    df.text.dropna(inplace=True)

    # Remove stop-words
    df.text = df.text.apply(lambda x: remove_stopwords(x))

    # Get the lemma of the words
    df.text = df.text.apply(lambda x: lemmatize_words(x))

    # Function to remove any extra unwanted spaces
    df.text = df.text.apply(lambda x: re.sub(' +', ' ', x))

    return df

In [276]:
df2 = df.copy()
df2 = clean_data_1(df2)
df2

Unnamed: 0,text,target
0,deed reason earthquake may allah forgive u,1
1,forest fire near la ronge sask canada,1
2,resident asked ishelter place notified officer...,1
3,people receive wildfire evacuation order calif...,1
4,got sent photo ruby alaska smoke wildfire pour...,1
...,...,...
7601,breaking la refugio oil spill may costlier big...,1
7602,siren went wasnt forney tornado warning,1
7603,official say quarantine place alabama home pos...,1
7605,flip side walmart bomb everyone evacuate stay ...,1


In [251]:
df2.text[500:1000]

571     lonepine remembered around australia descendan...
572     miss gary busey son plays dixie electronic gre...
573     news fedex longer transport bioterror germs wa...
574     fedex longer transport bioterror germs via usa...
575     usa today fedex longer transport bioterror pat...
                              ...                        
1088        hatchetwielding gunman pepper spray fake bomb
1089      oops h bomb lost miles okinawan coast fell ship
1091    mf life vocal lyrical bomb saw live summer ama...
1092                                       ehutch da bomb
1093    jays rocking mlb bombed one rogers centre play...
Name: text, Length: 500, dtype: object

In [277]:
def tokenize(df):
    """ This function continues the cleaning process and removes numbers, urls, 
    gibberish characters and performs tokenization / lemmatization"""

    # Make a copy of the df
    df = df.copy()

    #Tokenizing




In [None]:
df3 = df2.copy()
df3 = tokenize(df3)
df3