In [1]:
#Importing necessary libraries
import pandas as pd
import seaborn as sns

In [2]:
#Load the train and test splits from CSV into dataframeTrain and dataframeTest
#To preserve user identity, the uploaded datasets contain only Tweet IDs, Tweet Text, Tweet Hashtags and attached URLs
dataframeTrain, dataframeTest = map(
    pd.read_csv,
    ('dataset/train.csv', 'dataset/test.csv'),
)

In [3]:
#visualizing missing values in the train set
#uncomment to view
#sns.heatmap(dataframeTrain.isnull(), cbar=False)

In [4]:
#visualizing missing values in the test set
#uncomment to view
#sns.heatmap(dataframeTest.isnull(), cbar=False)

In [5]:
#Fill NA in text columns with blank space in columns: text, hashtags, urls
def fillNA(dataset):
    """Return a new frame with missing text/hashtags/urls entries set to ' '.

    Only the 'text', 'hashtags' and 'urls' columns are filled; missing
    values in any other column are left as they are. The input frame is
    not modified, because ``DataFrame.fillna`` returns a new object.
    """
    blank = ' '
    replacements = dict.fromkeys(('text', 'hashtags', 'urls'), blank)
    return dataset.fillna(value=replacements)

In [6]:
#replace the missing values in both splits using fillNA
dataframeTrain, dataframeTest = fillNA(dataframeTrain), fillNA(dataframeTest)

In [7]:
#Now that we don't have any missing fields, let us clean the strings individually
def datasetCleaner(dataset):
    """Clean the 'text', 'hashtags' and 'urls' columns of a tweet DataFrame.

    The same steps are applied, in order, to each of the three columns:

    1. remove URLs (``http`` followed by non-whitespace characters),
    2. remove retweet markers such as ``RT @user``,
    3. remove remaining ``@username`` mentions,
    4. collapse runs of two or more spaces into a single space,
    5. replace ``&amp;`` with ``and`` and ``&lt;`` / ``&gt;`` with ``<`` / ``>``,
    6. lower-case the result.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with string columns 'text', 'hashtags' and 'urls'. Missing
        values are not filled here; they stay missing after cleaning.

    Returns
    -------
    pandas.DataFrame
        The same ``dataset`` object, with the three columns overwritten
        by their cleaned versions.
    """
    # (pattern, replacement) pairs, applied top to bottom to every column.
    substitutions = (
        # removes http format from texts
        (r'http\S+', ''),
        # removes RT (retweet)
        (r'(RT|rt)[ ]*@[ ]*[\S]+', ''),
        # removes @username
        (r'@[\S]+', ''),
        # replaces double or multiple spaces with single space.
        # The quantifier must be written '{2,}': with a space inside the
        # braces ('{2, }') the regex engine matches that text literally.
        (r'[ ]{2,}', ' '),
        # replaces & with and
        (r'&amp;?', 'and'),
        # replaces the text format of symbols with symbols: <,>
        (r'&lt;', '<'),
        (r'&gt;', '>'),
    )

    for column in ('text', 'hashtags', 'urls'):
        cleaned = dataset[column]
        for pattern, replacement in substitutions:
            # regex=True is passed explicitly: since pandas 2.0 the default
            # for Series.str.replace is regex=False, which would treat every
            # pattern above as a literal string. Using the .str accessor
            # (not Series.replace) makes this a substring substitution
            # rather than a whole-value match.
            cleaned = cleaned.str.replace(pattern, replacement, regex=True)
        # as most of the encoders and vocabulary embeddings are lower cased,
        # we need to lower case our data
        dataset[column] = cleaned.str.lower()
    return dataset

In [8]:
#run datasetCleaner over both splits
dataframeTrain, dataframeTest = datasetCleaner(dataframeTrain), datasetCleaner(dataframeTest)

In [9]:
#combine the contents of the cleaned text, hashtags and urls columns into a single text column, processedText
def createProcessedText(dataset):
    """Add a 'processedText' column joining text, hashtags and urls.

    Each of the three source columns is cast to ``str`` and the results are
    concatenated with a single space between them. The new column is
    written onto ``dataset`` itself, which is also returned.
    """
    pieces = [dataset[name].astype(str) for name in ('text', 'hashtags', 'urls')]
    combined = pieces[0]
    for piece in pieces[1:]:
        combined = combined + ' ' + piece
    dataset['processedText'] = combined
    return dataset

In [10]:
#add the processedText column to both splits
dataframeTrain, dataframeTest = createProcessedText(dataframeTrain), createProcessedText(dataframeTest)

In [11]:
#creating a function that encapsulates the above tasks and can be directly called in training files
def preProcessDataset(datasetTrain, datasetTest):
    """Run the full preprocessing pipeline on the train and test frames.

    Each frame goes through fillNA, then datasetCleaner, then
    createProcessedText. The train frame is processed completely before
    the test frame. Returns the tuple (processed train, processed test).
    """
    def _prepare(frame):
        # fill missing values -> clean strings -> build the combined column
        return createProcessedText(datasetCleaner(fillNA(frame)))

    return _prepare(datasetTrain), _prepare(datasetTest)