In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import re
from tqdm import tqdm
%matplotlib inline

In [2]:
import chardet

# Guess the training file's encoding from its first 100000 raw bytes.
with open("Corona_NLP_train.csv", mode="rb") as rawdata:
    result = chardet.detect(
        rawdata.read(100_000)
    )
# Bare last expression so the notebook displays the detection dict.
result

{'encoding': 'utf-8', 'confidence': 0.99, 'language': ''}

In [3]:
# Load the train and test splits with the same codec.
# NOTE(review): the chardet probe in the previous cell reported utf-8 for the
# first 100000 bytes, yet latin1 is used here — presumably bytes further into
# the file are not valid UTF-8; confirm before changing the encoding.
train_df, test_df = (
    pd.read_csv(csv_path, encoding="latin1")
    for csv_path in ("Corona_NLP_train.csv", "Corona_NLP_test.csv")
)

In [9]:
# Patterns used by the tweet-cleaning helpers below.
_HASHTAG_RE = re.compile(r"#(\w+)")
_MENTION_RE = re.compile(r"@(\w+)")
_URL_RE = re.compile(r"http\S+")
_NON_ALNUM_RE = re.compile(r"[^A-Za-z0-9]+")

# Ordered (pattern, replacement) pairs. The two specific forms must run before
# the generic "n't" rule, otherwise "won't" would become "wo not" and "can't"
# would become "ca not".
_CONTRACTIONS = (
    # specific
    (re.compile(r"won't"), "will not"),
    (re.compile(r"can\'t"), "can not"),
    # general
    (re.compile(r"n\'t"), " not"),
    (re.compile(r"\'re"), " are"),
    (re.compile(r"\'s"), " is"),  # also rewrites possessives ("john's" -> "john is")
    (re.compile(r"\'d"), " would"),
    (re.compile(r"\'ll"), " will"),
    (re.compile(r"\'t"), " not"),
    (re.compile(r"\'ve"), " have"),
    (re.compile(r"\'m"), " am"),
)


def _extract_hashtags_and_mentions(text):
    """Return (hashtags, mentions) found in ``text``; two empty lists for non-strings."""
    if not isinstance(text, str):
        return [], []
    return _HASHTAG_RE.findall(text), _MENTION_RE.findall(text)


def _decontract(phrase):
    """Expand English contractions in ``phrase`` using the ordered rules above."""
    for pattern, replacement in _CONTRACTIONS:
        phrase = pattern.sub(replacement, phrase)
    return phrase


def _clean_tweet(text):
    """Strip URLs, expand contractions and keep only space-separated alphanumeric tokens."""
    if not isinstance(text, str):
        # Missing / non-text tweets yield an empty cleaned string.
        return ""
    cleaned = _URL_RE.sub("", text)  # remove URLs
    cleaned = _decontract(cleaned)
    # These target the literal two-character sequences backslash+r, backslash+n
    # and backslash+quote; real control characters are removed by the
    # alphanumeric filter below.
    cleaned = cleaned.replace('\\r', ' ')
    cleaned = cleaned.replace('\\n', ' ')
    cleaned = cleaned.replace('\\"', ' ')
    cleaned = _NON_ALNUM_RE.sub(" ", cleaned)  # drop anything non-alphanumeric
    # Collapse runs of whitespace and normalise case.
    return " ".join(cleaned.split()).lower().strip()


def preprocess(df):
    """Build a preprocessed copy of a tweet DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``OriginalTweet`` column. Any other columns are carried
        over unchanged. The input frame itself is NOT modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which:

        * ``OriginalTweet`` is lower-cased (non-string values such as NaN are
          left as they are instead of raising),
        * ``hashtags`` holds the space-joined ``#tag`` names of each tweet,
        * ``tags`` holds the space-joined ``@mention`` names of each tweet,
        * ``preprocessed tweets`` holds the cleaned text (URLs removed,
          contractions expanded, only alphanumeric tokens kept).

        Rows whose tweet is not a string get empty strings in the three
        derived columns.

    Raises
    ------
    KeyError
        If ``df`` has no ``OriginalTweet`` column.
    """
    # Work on a copy so re-running the cell, or reusing the input frame
    # elsewhere, never sees partially processed data.
    result = df.copy()

    # Lower-case first so hashtags, mentions and cleaned text are all
    # case-normalised; guard against NaN / non-string cells.
    lowered = [
        tweet.lower() if isinstance(tweet, str) else tweet
        for tweet in result["OriginalTweet"]
    ]
    result["OriginalTweet"] = lowered

    extracted = [_extract_hashtags_and_mentions(tweet) for tweet in lowered]
    result["hashtags"] = [" ".join(found_hashtags) for found_hashtags, _ in extracted]
    result["tags"] = [" ".join(found_mentions) for _, found_mentions in extracted]

    result["preprocessed tweets"] = [_clean_tweet(tweet) for tweet in lowered]
    return result
    

    

In [16]:
# Preprocess only the tweet text and its sentiment label from the training split.
train_preprocessed = preprocess(
    pd.DataFrame(train_df.loc[:, ["OriginalTweet", "Sentiment"]])
)

In [18]:
# Preprocess only the tweet text and its sentiment label from the test split.
test_preprocessed = preprocess(
    pd.DataFrame(test_df.loc[:, ["OriginalTweet", "Sentiment"]])
)

In [20]:
# Preview the first five preprocessed training rows.
train_preprocessed.head(n=5)

Unnamed: 0,OriginalTweet,Sentiment,hashtags,tags,preprocessed tweets
0,@menyrbie @phil_gahan @chrisitv https://t.co/i...,Neutral,,menyrbie phil_gahan chrisitv,menyrbie phil gahan chrisitv and and
1,advice talk to your neighbours family to excha...,Positive,,,advice talk to your neighbours family to excha...
2,coronavirus australia: woolworths to give elde...,Positive,,,coronavirus australia woolworths to give elder...
3,my food stock is not the only one which is emp...,Positive,covid19france covid_19 covid19 coronavirus con...,,my food stock is not the only one which is emp...
4,"me, ready to go at supermarket during the #cov...",Extremely Negative,covid19 coronavirus coronavirusfrance restezch...,,me ready to go at supermarket during the covid...


In [21]:
# Preview the first five preprocessed test rows.
test_preprocessed.head(n=5)

Unnamed: 0,OriginalTweet,Sentiment,hashtags,tags,preprocessed tweets
0,trending: new yorkers encounter empty supermar...,Extremely Negative,coronavirus,,trending new yorkers encounter empty supermark...
1,when i couldn't find hand sanitizer at fred me...,Positive,amazon coronavirus,,when i could not find hand sanitizer at fred m...
2,find out how you can protect yourself and love...,Extremely Positive,coronavirus,,find out how you can protect yourself and love...
3,#panic buying hits #newyork city as anxious sh...,Negative,panic newyork healthcare bigapple coronavirus ...,,panic buying hits newyork city as anxious shop...
4,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral,toiletpaper dunnypaper coronavirus coronavirus...,,toiletpaper dunnypaper coronavirus coronavirus...
