# Data Cleaning

In [None]:
#TODO data analysis esplorativa iniziale 
# quanti tweet per politico?
# quanti like per politico?
# quanti retweet?
# ...

# NOTE that here we also have retweets
#for politician in POLITICIANS:
#    print(politician + " " + str(len([tweet for tweet in date_filtered_data[politician] if "RT" not in tweet])))

## Importing libraries and data

In [64]:
import json
import os
import re
import pandas as pd

from datetime import datetime

import spacy

from sklearn.feature_extraction.text import CountVectorizer
from string import punctuation, digits

In [3]:
RELEVANT_FIELDS = ["text", "created_at"]

def read_data(input_directory: str):
    input_data = dict()

    for filename in os.listdir(input_directory):
        if filename.endswith("json"):
            politician_name = filename.split(".")[0]

            file_location = os.path.join(input_directory, filename)
            file = open(file_location, "r")

            tweets = json.load(file)["tweets"]

            filtered_tweets = [{ key: tweet[key] for key in RELEVANT_FIELDS } for tweet in tweets]

            input_data[politician_name] = filtered_tweets
            
        else: 
            raise Exception(f"Input file {filename} has a non supported format.")
    return input_data

In [4]:
input_data = read_data("data")

## Defining cleaning function

- Remove stopwords
- Combine tweets that are part1 and part2
- DONE Remove tweets after a deadline (e.g. the midnight of the election)
- DONE Remove tweets before a deadline (e.g. max 3 months old) -> this is needed because we need to compare similar timeframes
- tokenization?
- stemming / lemmatization


In [None]:
# TODO do we need to remove digits?
# TODO is it bad to split tweets into subsentences? -> TODO join multiple splitted tweets ((2/2))
# TODO note that there are tweets related to pics that we dont have
# TODO how to manage hashtags and citations (#/@)
# TODO create a pipeline funciton to use in pandas distributed-wise

In [36]:
# TODO create a pipeline funciton to use in pandas distributed-wise
def date_filter(tweet: dict, start_date = datetime(2022,7,22), end_date = datetime(2022,9,25)):
    created_at = datetime.strptime(tweet['created_at'], "%Y-%m-%dT%H:%M:%S.%fZ")
    return created_at >= start_date and created_at < end_date

def remove_links(tweet: dict):
    return re.sub(r'http\S+', '', tweet["text"])

def is_retweet(tweet: dict):
    return tweet["text"].startswith("RT @")

def preclean_tweet(tweet: dict):
    if date_filter(tweet) and not is_retweet(tweet):
        return remove_links(tweet)  
    else:
        return ""

In [43]:
POLITICIANS = list(input_data.keys())
date_filtered_data = {politician: list(filter(None, [preclean_tweet(tweet) for tweet in input_data[politician]])) for politician in POLITICIANS}

In [79]:
# Punctuation cleaning

nlp = spacy.load("it_core_news_lg")

corpus = dict()
for politician in POLITICIANS:
    corpus[politician] = []
    for tweet in date_filtered_data[politician]:
        for sentence in nlp(tweet).sents:
            # TODO add readibility/ do better
            sentence_text_w_placeholder = re.sub('\+Europa', 'SPECIFICPOLITICALPARTYPLACEHOLDER', sentence.text, flags=re.IGNORECASE)
            cleaned_sentence_text = "".join([x for x in sentence_text_w_placeholder if x not in punctuation and x not in digits])
            corpus[politician].append(re.sub('SPECIFICPOLITICALPARTYPLACEHOLDER', '+Europa', cleaned_sentence_text))

In [97]:
# Tokenization, Lemmatization, StopWords Remover

tokenize = lambda text: [x.lemma_.lower() for x in nlp(text) if x.pos_ in ['NOUN', 'PROPN']]
vectorizer = CountVectorizer(tokenizer=tokenize)
X = vectorizer.fit_transform(corpus["salvini"])
Xa = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())



In [106]:
Xa.sum().sort_values(ascending=False)

settembrevotolega    197
lega                 181
settembre            148
italiano              96
italia                90
                    ... 
intelligenza           1
insicurezza            1
insetto                1
insegnamento           1
🥲                      1
Length: 1896, dtype: int64


In [107]:
tokenize = lambda text: [x.lemma_.lower() for x in nlp(text) if x.pos_ in ['NOUN', 'PROPN']]
vectorizer = CountVectorizer(tokenizer=tokenize)
X = vectorizer.fit_transform(corpus["calenda"])
Xa = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())



In [108]:
Xa.sum().sort_values(ascending=False)

drago       137
paese       123
italia      106
pd          106
governo     103
           ... 
opinione      1
opificio      1
distanza      1
operaio       1
a             1
Length: 2408, dtype: int64

In [109]:
tokenize = lambda text: [x.lemma_.lower() for x in nlp(text) if x.pos_ in ['NOUN', 'PROPN']]
vectorizer = CountVectorizer(tokenizer=tokenize)
X = vectorizer.fit_transform(corpus["renzi"])
Xa = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
Xa.sum().sort_values(ascending=False)



secondo      17
drago        16
campagna     16
settembre    14
melone       14
             ..
giugno        1
giovedì       1
giovane       1
giorgia       1
♂             1
Length: 542, dtype: int64

In [110]:
tokenize = lambda text: [x.lemma_.lower() for x in nlp(text) if x.pos_ in ['NOUN', 'PROPN']]
vectorizer = CountVectorizer(tokenizer=tokenize)
X = vectorizer.fit_transform(corpus["conte"])
Xa = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
Xa.sum().sort_values(ascending=False)



sera          58
intervista    58
movstelle     56
grazie        40
diretta       32
              ..
lisola         2
linteresse     2
linea          2
limpegno       2
📺              2
Length: 672, dtype: int64

In [None]:
def clean_data(corpus: list):
    # TODO
    return corpus

## Clean data

In [None]:
out_data = clean_data(input_data)