# Jupyter Notebook function to clean the duplicates between two datasets.

# 1. Import the Python libraries required for this notebook to work.

# In [1]:
import re
import nltk
import tweepy
import numpy as np
import pandas as pd
from textblob import TextBlob
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import json
import seaborn as sb

# 2. The following function removes all non-text objects from a tweet; it is a purely regex-based function.

# In [2]:
# Patterns deleted from a tweet, applied in this order. Raw strings are used
# so the regex escapes are not parsed as (invalid) string escapes.
_TWEET_NOISE_PATTERNS = (
    # NOTE: the previous patterns '@(.*?)' and '#(.*?)' ended in a lazy group
    # that can only match the empty string, so they only ever removed the
    # symbol itself. That behaviour is kept; use r'@\w+' / r'#\w+' instead if
    # the whole mention/hashtag should be dropped.
    re.compile(r'@'),               # mention marker (@)
    re.compile(r'#'),               # hashtag marker (#)
    # \b keeps words that merely end in "RT" (e.g. "ALERT now") intact.
    re.compile(r'\bRT\s+'),         # retweet marker
    re.compile(r'https?://\S+'),    # URL links
    re.compile(r'\[.*?\]'),         # bracketed markup, e.g. [$lt...$gt]
)


def clean_tweets(tweets: str) -> str:
    """Strip non-text noise from a single tweet.

    Removes, in order: ``@`` symbols, ``#`` symbols, the retweet marker
    ``RT`` followed by whitespace, http/https URLs, and any text enclosed in
    square brackets on a single line.

    Args:
        tweets: Text of one tweet.

    Returns:
        The tweet text with every match of the patterns above deleted.
        Surrounding whitespace is left untouched.
    """
    for pattern in _TWEET_NOISE_PATTERNS:
        tweets = pattern.sub('', tweets)
    return tweets

# 3. The following function removes all shared tweets from the two input data sets.

# In [3]:
def removing_shared(data_a, data_b):
    """Drop every item that appears in both input collections.

    Args:
        data_a: First collection of tweets.
        data_b: Second collection of tweets.

    Returns:
        A tuple ``(frame_a, frame_b)`` of DataFrames with a single
        ``'Tweets'`` column. ``frame_a`` holds the items of ``data_a`` that
        are not in ``data_b`` and ``frame_b`` the items of ``data_b`` that
        are not in ``data_a``. Original order and repeated items are kept.
    """
    # Set lookups avoid rescanning the other collection for every item.
    # If any item is unhashable, fall back to testing against the inputs.
    try:
        seen_a, seen_b = set(data_a), set(data_b)
    except TypeError:
        seen_a, seen_b = data_a, data_b

    clean_a = [item for item in data_a if item not in seen_b]
    clean_b = [item for item in data_b if item not in seen_a]

    return (
        pd.DataFrame(clean_a, columns=['Tweets']),
        pd.DataFrame(clean_b, columns=['Tweets']),
    )

# 4. Loading different datasets

# In [4]:
def load_dataset(filename):
    """Read a JSON file and return its decoded content.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load`` for that file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # The context manager closes the handle even when parsing fails. UTF-8 is
    # requested explicitly so decoding does not depend on the platform locale.
    with open(filename, encoding="utf-8") as file:
        return json.load(file)

# 5. Applies all functions to the datasets in the "Raw" folder and saves the results under the "Clean" folder.

# 5.1. Clean the before/after Elon Musk tweet data for the Donald Trump-related tweets.

# In [5]:
# Raw JSON inputs for the Donald Trump-related tweets (before / after).
DATASET_A = "Raw/data_before_Elon_Musk_DonaldTrump.json"
DATASET_B = "Raw/data_after_Elon_Musk_DonaldTrump.json"

# Load both datasets.
data_a = load_dataset(DATASET_A)
data_b = load_dataset(DATASET_B)

# Keep only the tweets that are not shared between the two datasets.
res_a, res_b = removing_shared(data_a, data_b)

# Run the regex clean-up over every remaining tweet.
res_a["Tweets"] = res_a["Tweets"].apply(clean_tweets)
res_b["Tweets"] = res_b["Tweets"].apply(clean_tweets)

# Write the cleaned tweets out as CSV files.
res_a.to_csv(path_or_buf="Clean/clean_before_Elon_Musk_DonaldTrump.csv")
res_b.to_csv(path_or_buf="Clean/clean_after_Elon_Musk_DonaldTrump.csv")

# 5.2. Clean the before/after Elon Musk tweet data for the Elon Musk-related tweets.

# In [6]:
# Raw JSON inputs for the Elon Musk-related tweets (before / after).
DATASET_A = "Raw/data_before_Elon_Musk.json"
DATASET_B = "Raw/data_after_Elon_Musk.json"

# Load both datasets.
data_a = load_dataset(DATASET_A)
data_b = load_dataset(DATASET_B)

# Keep only the tweets that are not shared between the two datasets.
res_a, res_b = removing_shared(data_a, data_b)

# Run the regex clean-up over every remaining tweet.
res_a["Tweets"] = res_a["Tweets"].apply(clean_tweets)
res_b["Tweets"] = res_b["Tweets"].apply(clean_tweets)

# Write the cleaned tweets out as CSV files.
res_a.to_csv(path_or_buf="Clean/clean_before_Elon_Musk.csv")
res_b.to_csv(path_or_buf="Clean/clean_after_Elon_Musk.csv")