# MACHINE LEARNING

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
%matplotlib inline

In [None]:
# Read the parquet directory and convert `created_at` to datetime in one chain,
# then preview the first rows.
american_politicians_df = (
    pd.read_parquet('../data/american_politicians/parquet/', engine='pyarrow')
    .assign(created_at=lambda frame: pd.to_datetime(frame['created_at']))
)
display(american_politicians_df.head())

In [None]:
def plot_scatterplots(df, columns, x_label):
    """Plot each column against ``x_label`` in linear and in log scale.

    One figure row is drawn per entry of ``columns``: the left panel is a
    scatterplot with a fitted regression line on linear axes, the right panel
    is the same scatterplot on log-log axes. The figure is shown with
    ``plt.show()``; nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        The DataFrame to plot.
    columns : list
        The columns to plot on the y-axis, one figure row each.
    x_label : str
        The column to plot on the x-axis.
    """
    nb_rows = len(columns)
    if nb_rows == 0:
        # Nothing to draw; a grid with zero rows cannot be created.
        return

    # squeeze=False always yields a 2-D (nb_rows, 2) array of axes, so a single
    # column is indexed exactly like several.
    fig, axes = plt.subplots(nb_rows, 2, figsize=(16, 4 * nb_rows), squeeze=False)
    colors = ['royalblue', 'dodgerblue', 'cornflowerblue', 'skyblue', 'lightsteelblue', 'lightblue', 'lightskyblue', 'powderblue']
    x_label_fmt = x_label.replace('_', ' ').title()

    for row, column in enumerate(columns):
        y_label_fmt = column.replace('_', ' ').title()
        # Cycle through the palette so more columns than colors does not raise IndexError.
        color = colors[row % len(colors)]
        ax_linear, ax_log = axes[row]

        sns.scatterplot(x=df[x_label], y=df[column], color=color, alpha=0.5, ax=ax_linear)
        # scatter=False: the points are already drawn by the call above; drawing
        # them again would cover the semi-transparent markers.
        sns.regplot(x=df[x_label], y=df[column], color=color, scatter=False, ax=ax_linear)
        ax_linear.set_xlabel(x_label_fmt, fontsize=8)
        ax_linear.set_ylabel(y_label_fmt, fontsize=8)
        ax_linear.set_title(f'{y_label_fmt} as a Function of {x_label_fmt} in Linear Scale', fontsize=9)
        ax_linear.set_xscale('linear')
        ax_linear.set_yscale('linear')

        sns.scatterplot(x=df[x_label], y=df[column], color=color, alpha=0.5, ax=ax_log)
        ax_log.set_xlabel(x_label_fmt, fontsize=8)
        ax_log.set_ylabel(y_label_fmt, fontsize=8)
        ax_log.set_title(f'{y_label_fmt} as a Function of {x_label_fmt} in Log Scale', fontsize=9)
        ax_log.set_xscale('log')
        ax_log.set_yscale('log')

    plt.tight_layout()
    plt.show()

In [None]:
# Plot the scatterplots of the engagement counts against impression count
columns = ['retweet_count', 'reply_count', 'like_count', 'quote_count']
# The call below is commented out, so this cell currently only defines `columns`.
# plot_scatterplots(american_politicians_df, columns, 'impression_count')

In [None]:
# Plot the scatterplots of the impression counts against follower counts
columns = ['impression_count']
# The call below is commented out, so this cell currently only rebinds `columns`.
# plot_scatterplots(american_politicians_df, columns, 'followers_count')

### GET FEATURES FROM DATAFRAME

Tweet info:
- Tweet length  ✅
- Time of the tweet (morning, afternoon, night) ✅
- Sentiment of the tweet (score computed by model) ✅
- Number of hashtags ✅
- Number of mentions ✅
- Number of URLs ✅
- Media type (video, image, text, ...) => careful: this is an array
- (To clarify: location) 🚧

User info:
- Verified ✅
- Profile creation date ✅
- (To clarify: Tweet frequency) 🚧

# EXTRACTING FEATURES

In [None]:
from pyspark.sql import SparkSession
# Get or create a Spark session configured with 20g of driver memory, then read the tweets JSON-lines file
spark = SparkSession.builder.config("spark.driver.memory", "20g").getOrCreate()
df = spark.read.json('../data/american_politicians/tweets.jsonl')
# Keep the DataFrame's underlying RDD; the following cells use RDD transformations on it
json_rdd = df.rdd
# Bare expression: shows the session object as the cell output
spark

In [None]:
# Preview the text and both kinds of annotations of the first ten tweets.
# Records without a 'data' list are skipped (as in the feature-extraction cell),
# and tweets whose 'entities' field is empty get None instead of raising.
(
    json_rdd
    .filter(lambda x: x['data'])
    .flatMap(lambda x: x['data'])
    .map(lambda x: {
        'text': x['text'],
        'context_annotations': x['context_annotations'],
        'inner_annotations': x['entities']['annotations'] if x['entities'] else None,
    })
    .take(10)
)

In [None]:
# Flatten every record's 'data' list into a single RDD of tweets. Records with
# an empty or missing 'data' field are skipped, as in the feature-extraction cell.
tweets_rdd = json_rdd.filter(lambda x: x['data']).flatMap(lambda x: x['data'])

# get total number of tweets
total_tweets_count = tweets_rdd.count()

# get total number of tweets without context annotations
tweets_without_context_count = tweets_rdd.filter(lambda x: not x['context_annotations']).count()

# A cell only displays its last bare expression, so both counts are printed explicitly.
print(f'Total tweets: {total_tweets_count}')
print(f'Tweets without context annotations: {tweets_without_context_count}')

In [None]:
# Find the most common context annotations that will be used for clustering later

# Result: an RDD of (entity name, number of tweets mentioning it), sorted by
# descending count.
annotations = (
    json_rdd
    .filter(lambda x: x['data'])  # skip records that carry no tweets
    .flatMap(lambda x: x['data'])
    .map(lambda x: x['context_annotations'])
    .filter(lambda x: x is not None)
    # Count each entity name at most once per tweet; annotations without an
    # entity are skipped, mirroring the guard in one_hot_encoding.
    .flatMap(lambda x: {y['entity']['name'] for y in x if y['entity']})
    .map(lambda x: (x, 1))
    .reduceByKey(lambda x, y: x + y)
    .sortBy(lambda x: x[1], ascending=False)
)


In [None]:
# Names of the annotations counted more than 750 times. `annotations` is sorted
# by descending count, so slicing off index 0 drops the most frequent one
# (per the original note: 'Politics', present in nearly all tweets).
most_frequent_annotations = [
    pair[0]
    for pair in annotations.filter(lambda pair: pair[1] > 750).collect()
][1:]

# Map every remaining annotation name to its position in the list.
annotation_dict = dict(
    zip(most_frequent_annotations, range(len(most_frequent_annotations))))


In [None]:
# Display the annotation name -> index mapping
annotation_dict

In [None]:
from transformers import pipeline
# Build five zero-shot-classification pipelines, one per candidate model.
# NOTE(review): zero_shot_classifier3 is not called by any of the timing cells
# visible in this notebook.
zero_shot_classifier1 = pipeline('zero-shot-classification',
                                model='facebook/bart-large-mnli')

zero_shot_classifier2 = pipeline('zero-shot-classification', model='roberta-large-mnli')

zero_shot_classifier3 = pipeline(
    'zero-shot-classification', model='huggingface/distilbert-base-uncased-finetuned-mnli')

zero_shot_classifier4 = pipeline('zero-shot-classification', model='valhalla/distilbart-mnli-12-3')

zero_shot_classifier5 = pipeline('zero-shot-classification', model='valhalla/distilbart-mnli-12-9')


# Disabled: `zero_shot_classifier` (without a numeric suffix) is not defined in this cell.
#broadcast_classifier = spark.sparkContext.broadcast(zero_shot_classifier)


In [None]:
# Sample text passed to each zero-shot classifier in the timing cells below
sentence = 'Joe Biden is the 46th President of the United States, having taken office on January 20, 2021. He served as Vice President under President Barack Obama from 2009 to 2017. His presidency has focused on a range of issues, including COVID-19 pandemic response, climate change, immigration reform, and economic recovery.'


In [None]:
%%time
# Time zero_shot_classifier1 (facebook/bart-large-mnli) on the sample sentence, using the frequent annotations as labels
zero_shot_classifier1(sentence, most_frequent_annotations)

In [None]:
%%time
# Time zero_shot_classifier2 (roberta-large-mnli) on the sample sentence, using the frequent annotations as labels
zero_shot_classifier2(sentence, most_frequent_annotations)


In [None]:
%%time
# Time zero_shot_classifier4 (valhalla/distilbart-mnli-12-3) on the sample sentence, using the frequent annotations as labels
zero_shot_classifier4(sentence, most_frequent_annotations)


In [None]:
%%time
# Time zero_shot_classifier5 (valhalla/distilbart-mnli-12-9) on the sample sentence, using the frequent annotations as labels
zero_shot_classifier5(sentence, most_frequent_annotations)


In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from datetime import datetime
import re

# Function to get the sentiment of a tweet
def analyse_sentiment(x):
    """Return the VADER compound sentiment score of the text ``x``.

    The analyzer does not depend on the input, so it is created on the first
    call and cached on the function object instead of being rebuilt for every
    tweet.

    Parameters
    ----------
    x : str
        The text to score.

    Returns
    -------
    The value stored under the 'compound' key of ``polarity_scores(x)``.
    """
    analyzer = getattr(analyse_sentiment, '_analyzer', None)
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()
        analyse_sentiment._analyzer = analyzer
    return analyzer.polarity_scores(x)['compound']

def add_key_value(x, key, value):
    """Store ``value`` under ``key`` in the mapping ``x`` (in place) and return ``x``."""
    x.update({key: value})
    return x

def keep_medias_only(x):
    """Return the non-empty 'media_key' values found in ``x['tweet_urls']``.

    An empty or missing url list yields an empty list. Entries without a
    'media_key', or with a falsy one, are skipped.
    """
    url_entries = x['tweet_urls']
    if not url_entries:
        return []

    media_keys = []
    for url in url_entries:
        if 'media_key' in url and url['media_key']:
            media_keys.append(url['media_key'])
    return media_keys

def get_number_medias(x):
    """Return how many entries ``x['tweet_media_keys']`` holds."""
    media_keys = x['tweet_media_keys']
    return len(media_keys)

def get_number_external_urls(x):
    """Return the number of urls in ``x['tweet_urls']`` minus ``x['tweet_medias_count']``.

    An empty or missing url list counts as zero urls.
    """
    urls = x['tweet_urls']
    total_urls = len(urls) if urls else 0
    return total_urls - x['tweet_medias_count']

def get_period_of_day(x):
    """Classify ``x['tweet_date']`` as 'morning', 'afternoon' or 'night'.

    The timestamp is parsed with the format ``%Y-%m-%dT%H:%M:%S.%fZ`` and the
    hour is used as written in the string: hours 6-11 are 'morning', 12-17 are
    'afternoon', everything else is 'night'.
    """
    parsed = datetime.strptime(x['tweet_date'], "%Y-%m-%dT%H:%M:%S.%fZ")
    if 6 <= parsed.hour < 12:
        return 'morning'
    if 12 <= parsed.hour < 18:
        return 'afternoon'
    return 'night'

def one_hot_encoding(x, encoding_dict):
    """Build a 0/1 vector marking which known annotations ``x`` carries.

    Parameters
    ----------
    x : mapping
        Record whose 'context_annotations' entry is a sequence of either plain
        names (str) or items exposing ``['entity']['name']``.
    encoding_dict : dict
        Maps an annotation name to its index in the returned vector.

    Returns
    -------
    list
        A list of ``len(encoding_dict)`` integers, 1 at the index of every
        annotation name found in ``encoding_dict`` and 0 elsewhere.
    """
    flags = [0 for _ in range(len(encoding_dict))]

    # An empty or missing annotation list leaves the vector all zeros.
    for item in x['context_annotations'] or ():
        if isinstance(item, str):
            label = item
        else:
            entity = item['entity']
            if not entity:
                # Annotation without entity information: nothing to encode.
                continue
            label = entity['name']

        if label in encoding_dict:
            flags[encoding_dict[label]] = 1

    return flags

def add_dummy_encoding(x, column_names):
    """Spread ``x['encoded_annotations']`` into one ``dummy_<name>`` key per column.

    Each name in ``column_names`` is paired with the value at the same position
    of ``x['encoded_annotations']``. The key suffix is the lower-cased name with
    every character outside ``[A-Za-z0-9_]`` removed (spaces included) and each
    '__' replaced by '_'. ``x`` is modified in place and returned.
    """
    flag_by_name = dict(zip(column_names, x['encoded_annotations']))

    for raw_name in flag_by_name:
        slug = re.sub('__', '_', re.sub('[^A-Za-z0-9_]+', '', raw_name.lower()))
        # The first substitution already removed every space, so the slug is
        # used as-is for the key suffix.
        x['dummy_' + slug] = flag_by_name[raw_name]

    return x

def add_dummy_tweet_period(x):
    """Add 0/1 ``dummy_tweet_period_*`` keys to ``x`` from ``x['tweet_period']``.

    The morning, afternoon and night keys are all reset to 0, then the key
    matching the record's period is set to 1. ``x`` is modified in place and
    returned.
    """
    period = x['tweet_period']

    x.update({
        'dummy_tweet_period_morning': 0,
        'dummy_tweet_period_afternoon': 0,
        'dummy_tweet_period_night': 0,
    })
    x[f'dummy_tweet_period_{period}'] = 1

    return x

        

In [None]:
# 1. Build one dict per tweet from the raw records.
# Records with an empty 'data' field and tweets with empty 'entities' are dropped first.
json_rdd_data_fields =json_rdd.filter(lambda x: x['data']).flatMap(lambda x: x['data']).filter(lambda x: x['entities']) \
.map(lambda x : {
    'tweet_text': x['text'],
    'tweet_date': x['created_at'],
    'tweet_hashtags': x['entities']['hashtags'],
    'tweet_mentions': x['entities']['mentions'],
    'tweet_urls': x['entities']['urls'], 
    'user_id': x['author_id'],
    'tweet_id': x['id'],
    # missing context annotations are normalised to an empty list
    'context_annotations': x['context_annotations'] if x['context_annotations'] else [],
    'impression_count': x['public_metrics']['impression_count'],
    # 'retweet_count': x['public_metrics']['retweet_count'],
    # 'reply_count': x['public_metrics']['reply_count'],
    # 'like_count': x['public_metrics']['like_count'],
})

# adding sentiment analysis on the tweet text using vader to the data
json_rdd_data_fields = json_rdd_data_fields.map(lambda x : add_key_value(x, 'tweet_sentiment', analyse_sentiment(x['tweet_text'])))

# adding the tweet length to the data
json_rdd_data_fields = json_rdd_data_fields.map(lambda x : add_key_value(x, 'tweet_length', len(x['tweet_text'])))

# adding the number of hashtags to the data
json_rdd_data_fields = json_rdd_data_fields.map(lambda x : add_key_value(x, 'hashtags_count', len(x['tweet_hashtags'])if x['tweet_hashtags'] else 0))

# adding the number of mentions to the data
json_rdd_data_fields = json_rdd_data_fields.map(lambda x : add_key_value(x, 'mentions_count', len(x['tweet_mentions'])if x['tweet_mentions'] else 0))

# adding the media keys only to the data (must run before the two counts below, which read it)
json_rdd_data_fields = json_rdd_data_fields.map(lambda x : add_key_value(x, 'tweet_media_keys', keep_medias_only(x)))

# adding the number of medias to the data
json_rdd_data_fields = json_rdd_data_fields.map(lambda x : add_key_value(x, 'tweet_medias_count', get_number_medias(x)))

# adding the number of external urls to the data (url count minus 'tweet_medias_count')
json_rdd_data_fields = json_rdd_data_fields.map(lambda x : add_key_value(x, 'tweet_external_urls_count', get_number_external_urls(x)))

# adding the period of the day to the data
json_rdd_data_fields = json_rdd_data_fields.map(lambda x : add_key_value(x, 'tweet_period', get_period_of_day(x)))

# dropping the raw list fields that are no longer needed
# NOTE(review): 'tweets_media_count' is never created above (the count is stored as
# 'tweet_medias_count'), so listing it here has no effect and 'tweet_medias_count'
# stays in the data. Confirm which behaviour is intended.
json_rdd_data_fields = json_rdd_data_fields.map(lambda x : {k: v for k, v in x.items() if k not in ['tweet_mentions', 'tweet_urls', 'tweet_hashtags', 'tweets_media_count']})


# getting the annotations and putting them in clusters using one hot encoding
json_rdd_data_fields = json_rdd_data_fields.map(lambda x : add_key_value(x, 'encoded_annotations', one_hot_encoding(x, annotation_dict)))


# GENERATE DUMMY VARIABLES FOR CATEGORICAL VARIABLES

# add dummy variables after one hot encoding
json_rdd_data_fields = json_rdd_data_fields.map(lambda x : add_dummy_encoding(x, most_frequent_annotations))

# add the three 0/1 period-of-day columns
json_rdd_data_fields = json_rdd_data_fields.map(lambda x : add_dummy_tweet_period(x))

# 2. Create a dataframe from the rdd

# transforming the data to a dataframe
# NOTE(review): 'tweet_text', 'user_id' and 'tweet_id' are not in the drop list,
# so they remain columns of regression_df.
regression_df = json_rdd_data_fields.toDF().drop('context_annotations','encoded_annotations','tweet_date','tweet_media_keys','tweet_period').persist()




In [None]:
# Convert the Spark DataFrame to a pandas DataFrame
regression_df_pd = regression_df.toPandas()

In [None]:
# Write the regression table to 'regression_df.csv' without the index column
regression_df_pd.to_csv('regression_df.csv', index=False)


In [None]:
def create_regressor_columns_string(columns, excluded=('impression_count', 'tweet_text', 'tweet_id', 'user_id')):
    """Build the right-hand side of a regression formula from column names.

    Parameters
    ----------
    columns : iterable of str
        Candidate column names.
    excluded : iterable of str, optional
        Names that must not be used as regressors. By default this is the
        target ('impression_count') plus the free-text and identifier columns
        that are still present in the regression table; as string columns they
        would otherwise enter the formula as categorical regressors.

    Returns
    -------
    str
        The remaining names joined with '+', in their original order (an empty
        string when nothing remains).
    """
    excluded = set(excluded)
    return "+".join(column for column in columns if column not in excluded)


In [None]:
# OLS regression of impression_count on the columns of regression_df_pd that
# create_regressor_columns_string keeps as regressors
import statsmodels.formula.api as smf
regressor_columns_string = create_regressor_columns_string(regression_df_pd.columns)
mod = smf.ols(formula=f'impression_count ~ {regressor_columns_string}', data=regression_df_pd)
res = mod.fit()
print(res.summary())

In [None]:
# Remove the variables that are not significant
import statsmodels.formula.api as smf


print(regression_df_pd.columns)

# Log-transform the target, log(1 + x), exactly once. The flag kept in
# DataFrame.attrs makes the cell safe to re-run: without it every re-run would
# take the log of already-transformed values.
if not regression_df_pd.attrs.get('impression_count_is_log'):
    regression_df_pd['impression_count'] = np.log1p(regression_df_pd['impression_count'])
    regression_df_pd.attrs['impression_count_is_log'] = True

# Preview the first rows only instead of rendering the whole frame
display(regression_df_pd.head())

# Columns left out of this second model
non_significant_columns = [
    'tweet_external_urls_count', 'dummy_tweet_period_night', 'dummy_joemanchin', 'dummy_chrismurphy', 'dummy_financialservicesbusiness', 'dummy_inflationintheunitedstates', 'dummy_joebiden', 'dummy_politicalfigures', 'dummy_northcarolina', 'dummy_tedcruz', 'dummy_sportsfitnessbusiness', 'dummy_unitedstatescongress', 'dummy_markwarner', 'dummy_politicalnews', 'dummy_chriscoons']

regressor_columns = [
    column for column in regression_df_pd.columns
    if column not in non_significant_columns]

regressor_columns_string = create_regressor_columns_string(
    regressor_columns)
mod = smf.ols(
    formula=f'impression_count ~ {regressor_columns_string}', data=regression_df_pd)
res = mod.fit()
print(res.summary())


In [None]:
# Column order wanted in the resulting DataFrame
includes_columns = ['user_profile_creation_date', 'user_verified', 'user_creation_date', 'user_id']

# One dict per record, read from the first entry of includes.users.
# NOTE(review): 'user_profile_creation_date' and 'user_creation_date' both read
# the same 'created_at' value.
json_rdd_includes_fields = json_rdd.map(lambda x: {
    'user_profile_creation_date': x['includes']['users'][0]['created_at'],
    'user_verified': x['includes']['users'][0]['verified'],
    'user_creation_date': x['includes']['users'][0]['created_at'],
    'user_id': x['includes']['users'][0]['id'],
})

# Turn each dict into a tuple in `includes_columns` order before naming the
# columns. When toDF() gets a list of names for an RDD of dicts, PySpark's
# schema inference orders the fields by sorted key and applies the names by
# position, which would store the user id under 'user_verified' and the
# verified flag under 'user_id'.
json_inclued_fields_df = json_rdd_includes_fields \
    .map(lambda fields: tuple(fields[name] for name in includes_columns)) \
    .toDF(includes_columns)

json_inclued_fields_df.count()