<h1 align="center"> LSIR LAB - SEMESTER PROJECT</h1> 

**Students:**
- Etienne BRUNO
- Abiola ADEYE

In [None]:
import pandas as pd
import json
from pyspark.sql import SparkSession

## LOAD SAMPLE DATA

In [None]:
# Get (or create) the Spark session and read the sample tweets (JSON Lines file) into a DataFrame
spark = SparkSession.builder.getOrCreate()
df = spark.read.json('../data/sample/sample.jsonl')
# Keep an RDD view of the rows: the language-specific cells below filter and map over `json_rdd`
json_rdd = df.rdd
# Print the schema of the loaded DataFrame
df.printSchema()
# Last expression of the cell: display the Spark session object
spark

### ENGLISH SAMPLE

In [None]:
# Load English tweets into an rdd.
# Records without `includes`, without any user in `includes.users`, or without `data` are skipped,
# so the `[0]` lookup below cannot fail on a missing or empty user list.
json_rdd_en = json_rdd.filter(
    lambda x: x['includes']
    and x['includes']['users']
    and x['data']
    and x['data']['lang'] == 'en'
)

# Create a dataframe with the data we want for the English tweets:
# id, author_id, created_at, retweet_count, reply_count, like_count, quote_count, impression_count, followers_count
# `includes.users` is a list of users; we take index [0] because the tweet is associated with a single
# user (its author) and the other entries are the users mentioned in the tweet (if any).
en_df = json_rdd_en.map(lambda x: [
    x['data']['id'],
    x['data']['author_id'],
    x['data']['created_at'],
    x['data']['public_metrics']['retweet_count'],
    x['data']['public_metrics']['reply_count'],
    x['data']['public_metrics']['like_count'],
    x['data']['public_metrics']['quote_count'],
    x['data']['public_metrics']['impression_count'],
    x['includes']['users'][0]['public_metrics']['followers_count'],
]).toDF([
    'id', 'author_id', 'created_at',
    'retweet_count', 'reply_count', 'like_count', 'quote_count', 'impression_count',
    'followers_count',
])
en_df.show(3)

# Save as parquet. The default save mode raises if the target directory already exists,
# so overwrite explicitly to keep this cell re-runnable (Restart Kernel -> Run All).
en_df.write.mode('overwrite').parquet('../data/sample/sample_en_parquet/')

### FRENCH SAMPLE

In [None]:
# Load French tweets into an rdd.
# Records without `includes`, without any user in `includes.users`, or without `data` are skipped,
# so the `[0]` lookup below cannot fail on a missing or empty user list.
json_rdd_fr = json_rdd.filter(
    lambda x: x['includes']
    and x['includes']['users']
    and x['data']
    and x['data']['lang'] == 'fr'
)

# Create a dataframe with the data we want for the French tweets:
# id, author_id, created_at, retweet_count, reply_count, like_count, quote_count, impression_count, followers_count
# `includes.users` is a list of users; we take index [0] because the tweet is associated with a single
# user (its author) and the other entries are the users mentioned in the tweet (if any).
fr_df = json_rdd_fr.map(lambda x: [
    x['data']['id'],
    x['data']['author_id'],
    x['data']['created_at'],
    x['data']['public_metrics']['retweet_count'],
    x['data']['public_metrics']['reply_count'],
    x['data']['public_metrics']['like_count'],
    x['data']['public_metrics']['quote_count'],
    x['data']['public_metrics']['impression_count'],
    x['includes']['users'][0]['public_metrics']['followers_count'],
]).toDF([
    'id', 'author_id', 'created_at',
    'retweet_count', 'reply_count', 'like_count', 'quote_count', 'impression_count',
    'followers_count',
])
# Preview the French dataframe (the original cell previewed `en_df` here by mistake)
fr_df.show(3)

# Save as parquet. The default save mode raises if the target directory already exists,
# so overwrite explicitly to keep this cell re-runnable (Restart Kernel -> Run All).
fr_df.write.mode('overwrite').parquet('../data/sample/sample_fr_parquet/')

## LOAD POLITICIANS DATA

### HELPER FUNCTIONS

In [None]:
def extract_tweet_data(x):
    """Flatten one tweet record into a list of its identifying fields and public metrics.

    `x` must support lookup by key (`x['id']`, `x['public_metrics'][...]`).

    Returns a list ordered as: id, author_id, created_at, retweet_count, reply_count,
    like_count, quote_count, impression_count.
    """
    record = [x[field] for field in ('id', 'author_id', 'created_at')]
    metrics = x['public_metrics']
    record.extend(
        metrics[counter]
        for counter in (
            'retweet_count',
            'reply_count',
            'like_count',
            'quote_count',
            'impression_count',
        )
    )
    return record


def extract_user_metadata(x):
    """Flatten one user record into a list of its id and public account metrics.

    `x` must support lookup by key (`x['id']`, `x['public_metrics'][...]`).

    Returns a list ordered as: id, followers_count, following_count, tweet_count, listed_count.
    """
    record = [x['id']]
    metrics = x['public_metrics']
    for counter in ('followers_count', 'following_count', 'tweet_count', 'listed_count'):
        record.append(metrics[counter])
    return record


# Column names for the tweet DataFrame, in the same order as the values
# returned by `extract_tweet_data`.
tweet_data_columns = [
    'id',
    'author_id',
    'created_at',
    'retweet_count',
    'reply_count',
    'like_count',
    'quote_count',
    'impression_count',
]

# Column names for the user DataFrame, in the same order as the values returned by
# `extract_user_metadata`. The user's own `id` is named `author_id` so that it can
# serve as the join key against the tweet DataFrame.
user_metadata_columns = [
    'author_id',
    'followers_count',
    'following_count',
    'tweet_count',
    'listed_count',
]


def tweet_and_user_data(json_rdd):
    """Build a DataFrame of tweets left-joined with their author's account metrics.

    Parameters
    ----------
    json_rdd : RDD of rows
        Each row exposes `data` (an iterable of tweet records) and `includes`
        (holding a `users` list). Rows with a falsy `data` contribute no tweets;
        rows with a falsy `includes` or a missing/empty `includes['users']`
        contribute no user metadata.

    Returns
    -------
    DataFrame
        One row per tweet with the columns of `tweet_data_columns`, plus the
        non-key columns of `user_metadata_columns` (null when no metadata was
        found for the tweet's `author_id`).

    Notes
    -----
    Only the first entry of `includes['users']` is read from each row
    (presumably the author of that row's tweets -- TODO confirm against the
    collection script). Since the same user can be the first entry of many
    rows, the user table is de-duplicated on `author_id` before the join;
    otherwise every tweet would be repeated once per metadata row of its
    author. When several snapshots of a user exist, which one is kept is not
    specified.
    """
    # One row per tweet: `data` holds several tweets per input record, hence flatMap.
    tweets_df = (
        json_rdd.filter(lambda x: x['data'])
        .flatMap(lambda x: x['data'])
        .map(extract_tweet_data)
        .toDF(tweet_data_columns)
    )

    # At most one row per user: guard against a missing/empty `users` list before
    # indexing [0], then drop repeated users so the join cannot multiply tweet rows.
    users_df = (
        json_rdd.filter(lambda x: x['includes'] and x['includes']['users'])
        .map(lambda x: x['includes']['users'][0])
        .map(extract_user_metadata)
        .toDF(user_metadata_columns)
        .dropDuplicates(['author_id'])
    )

    # Left join keeps every tweet even when no metadata exists for its author.
    return tweets_df.join(users_df, on='author_id', how='left')

### US POLITICIANS

In [None]:
# Load US politicians tweets into an rdd
# NOTE: this rebinds the notebook-level `df` and `json_rdd` defined in earlier sections;
# `getOrCreate()` returns the existing session if one is already running.
spark = SparkSession.builder.getOrCreate()
df = spark.read.json('../data/american_politicians/tweets.jsonl')
json_rdd = df.rdd
# Uncomment the next line to print the DataFrame schema
#df.printSchema()
# Last expression of the cell: display the Spark session object
spark

In [None]:
# Build the US politicians DataFrame: one row per tweet, left-joined on `author_id`
# with the user metadata extracted from the same rdd
us_politicians_df = tweet_and_user_data(json_rdd)
# Preview the first 3 rows
us_politicians_df.show(3)

In [None]:
# Save as parquet. The default save mode raises if the target directory already exists,
# so overwrite explicitly to keep this cell re-runnable (Restart Kernel -> Run All).
us_politicians_df.write.mode('overwrite').parquet('../data/american_politicians/parquet/')

### FRENCH POLITICIANS

In [None]:
# Load French politicians tweets into an rdd
# NOTE: this rebinds the notebook-level `df` and `json_rdd` defined in earlier sections;
# `getOrCreate()` returns the existing session if one is already running.
spark = SparkSession.builder.getOrCreate()
df = spark.read.json('../data/french_politicians/tweets.jsonl')
json_rdd = df.rdd
# Uncomment the next line to print the DataFrame schema
#df.printSchema()
# Last expression of the cell: display the Spark session object
spark

In [None]:
# Build the French politicians DataFrame: one row per tweet, left-joined on `author_id`
# with the user metadata extracted from the same rdd
fr_politicians_df = tweet_and_user_data(json_rdd)
# Preview the first 3 rows
fr_politicians_df.show(3)

In [None]:
# Save as parquet. The default save mode raises if the target directory already exists,
# so overwrite explicitly to keep this cell re-runnable (Restart Kernel -> Run All).
fr_politicians_df.write.mode('overwrite').parquet('../data/french_politicians/parquet/')

## LOAD CELEBRITIES DATA

### US CELEBRITIES

In [None]:
# Load US celebrities tweets into an rdd
# NOTE: this rebinds the notebook-level `df` and `json_rdd` defined in earlier sections;
# `getOrCreate()` returns the existing session if one is already running.
spark = SparkSession.builder.getOrCreate()
df = spark.read.json('../data/american_celebrities/tweets.jsonl')
json_rdd = df.rdd
# Uncomment the next line to print the DataFrame schema
#df.printSchema()
# Last expression of the cell: display the Spark session object
spark

In [None]:
# Build the US celebrities DataFrame: one row per tweet, left-joined on `author_id`
# with the user metadata extracted from the same rdd
us_celebrities_df = tweet_and_user_data(json_rdd)
# Preview the first 3 rows
us_celebrities_df.show(3)

In [None]:
# Save as parquet. The default save mode raises if the target directory already exists,
# so overwrite explicitly to keep this cell re-runnable (Restart Kernel -> Run All).
us_celebrities_df.write.mode('overwrite').parquet('../data/american_celebrities/parquet/')

### FRENCH CELEBRITIES

In [None]:
# Load French celebrities tweets into an rdd
# NOTE: this rebinds the notebook-level `df` and `json_rdd` defined in earlier sections;
# `getOrCreate()` returns the existing session if one is already running.
spark = SparkSession.builder.getOrCreate()
df = spark.read.json('../data/french_celebrities/tweets.jsonl')
json_rdd = df.rdd
# Uncomment the next line to print the DataFrame schema
#df.printSchema()
# Last expression of the cell: display the Spark session object
spark

In [None]:
# Build the French celebrities DataFrame: one row per tweet, left-joined on `author_id`
# with the user metadata extracted from the same rdd
fr_celebrities_df = tweet_and_user_data(json_rdd)
# Preview the first 3 rows
fr_celebrities_df.show(3)

In [None]:
# Save as parquet. The default save mode raises if the target directory already exists,
# so overwrite explicitly to keep this cell re-runnable (Restart Kernel -> Run All).
fr_celebrities_df.write.mode('overwrite').parquet('../data/french_celebrities/parquet/')