# Data preparation - Twitter posts
We prepare Twitter data for later topic modeling.

In [1]:
import os
import sys
import re
import glob
import numpy as np
import pandas as pd
import random
from joblib import Parallel, delayed
from datetime import datetime, timedelta
from pandas_profiling import ProfileReport

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging

logging.basicConfig(level=logging.INFO)

from tqdm import tqdm
import matplotlib.pyplot as plt
from pathlib import Path

import boto3
import pandas as pd
import io

### Global variables

In [2]:
# S3 bucket the tweet exports are read from (passed as `bucket_name` to read_prefix_to_df).
S3_BUCKET = "data.atoti.io"
# Key prefix under which the export files are listed in the bucket.
S3_FOLDER = "notebooks/influencers-analysis/"
# Text that must appear in an object's key for the file to be loaded
# (read_prefix_to_df matches it as a substring of the key).
CURRENT_MONTH = "May"
# Local folder, relative to the notebook's working directory, that receives the aggregated CSV.
DATA_PATH = Path("../data")

## Load data
For this example, let's load our posts from Twitter.

In [3]:
def read_prefix_to_df(bucket_name, prefix, suffix):
    """Load every matching tweet export under an S3 prefix into one DataFrame.

    An object listed under ``prefix`` is loaded when its key contains ``suffix``
    (a substring match, despite the parameter name), is not a folder placeholder
    (key ending in ``/``) and ends with ``.xlsx`` or ``.csv``. Rows whose
    ``text_length`` is below 1 are dropped and only the tweet columns listed in
    ``cols`` are kept.

    Args:
        bucket_name: Name of the S3 bucket to read from.
        prefix: Key prefix used to list the candidate objects.
        suffix: Text that must appear in an object's key for it to be loaded.

    Returns:
        A DataFrame with a fresh ``0..n-1`` index holding the selected columns
        of all loaded files, or an empty DataFrame with those columns when no
        file matched.

    Raises:
        KeyError: If a loaded file lacks ``text_length`` or a selected column.
    """
    cols = [
        "tweetLink",
        "retweet_count",
        "favorite_count",
        "reply_count",
        "quote_count",
        "text",
        "profileUrl",
        "name",
    ]
    bucket = boto3.resource("s3").Bucket(bucket_name)

    # Collect the per-file frames and concatenate once at the end; growing a
    # DataFrame with pd.concat inside the loop copies all previous rows each time.
    frames = []
    for obj in bucket.objects.filter(Prefix=prefix):
        key = obj.key
        # Filter on the key first so unrelated objects are never downloaded.
        if suffix not in key or key.endswith("/"):
            continue

        extension_source = key.lower()
        if extension_source.endswith(".xlsx"):
            body = obj.get()["Body"].read()
            df = pd.read_excel(io.BytesIO(body), engine="openpyxl")
        elif extension_source.endswith(".csv"):
            body = obj.get()["Body"].read()
            df = pd.read_csv(io.BytesIO(body), encoding="utf8")
        else:
            # Without this branch the previous file's frame would be appended a
            # second time (or a NameError raised if this were the first file).
            logging.warning("Skipping %s: unsupported file type", key)
            continue

        frames.append(df.loc[df["text_length"] >= 1, cols])

    if not frames:
        # pd.concat refuses an empty list; keep the expected schema instead.
        return pd.DataFrame(columns=cols)
    return pd.concat(frames, ignore_index=True)

In [4]:
# Fetch every export whose key mentions the selected month and stack them into one frame
df = read_prefix_to_df(
    bucket_name=S3_BUCKET,
    prefix=S3_FOLDER,
    suffix=CURRENT_MONTH,
)

# Report (rows, columns) before previewing the first rows
print(f"\nData size: {df.shape}\n\n")
df.head()

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials



Data size: (14304, 8)




Unnamed: 0,tweetLink,retweet_count,favorite_count,reply_count,quote_count,text,profileUrl,name
0,https://twitter.com/alvinfoo/status/1391184083...,10,22,7,1,Happy Mother’s Day! \n#happymothersday2021 #Ha...,https://twitter.com/alvinfoo,Alvin Foo
1,https://twitter.com/jblefevre60/status/1391340...,13,0,0,0,RT @pierrepinna: My buddy Jay the gull wishes ...,https://twitter.com/jblefevre60,Jean-Baptiste Lefevre
2,https://twitter.com/jblefevre60/status/1391391...,2,0,0,0,RT @Bastwins: Happy electric birthday to #guit...,https://twitter.com/jblefevre60,Jean-Baptiste Lefevre
3,https://twitter.com/jblefevre60/status/1391441...,11,0,0,0,RT @HaroldSinnott: What Is #5G ? \n\nWhere Is ...,https://twitter.com/jblefevre60,Jean-Baptiste Lefevre
4,https://twitter.com/jblefevre60/status/1391441...,17,0,0,0,RT @ipfconline1: How Do #NeuralNetworks Learn?...,https://twitter.com/jblefevre60,Jean-Baptiste Lefevre


In [5]:
# Profile the posts as loaded, i.e. before duplicate tweet links are dropped further down.
ProfileReport(df)

Summarize dataset:   0%|          | 0/21 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



Number of posts per Person Of Interest

In [6]:
# Distinct values of the "name" column; dropna=False keeps a missing name counted as one value
print(f"Number of influencers: {df['name'].nunique(dropna=False)}")

Number of influencers: 146


In [7]:
# Posts per influencer, most active first. Computed before duplicate tweet links
# are dropped in a later cell, so repeated tweets are counted here.
df["name"].value_counts()

Bob E. Hayes         583
KDnuggets            469
Tamara McCleary      465
Yves Mulkers         460
Kirk Borne           460
                    ... 
Misha Denil            1
Oriol Vinyals          1
Garrett Smith          1
Christopher Doyle      1
Jonathan Del Hoyo      1
Name: name, Length: 146, dtype: int64

In [8]:
# Same breakdown expressed as each influencer's fraction of all loaded posts.
df["name"].value_counts(normalize=True)

Bob E. Hayes         0.040758
KDnuggets            0.032788
Tamara McCleary      0.032508
Yves Mulkers         0.032159
Kirk Borne           0.032159
                       ...   
Misha Denil          0.000070
Oriol Vinyals        0.000070
Garrett Smith        0.000070
Christopher Doyle    0.000070
Jonathan Del Hoyo    0.000070
Name: name, Length: 146, dtype: float64

In [9]:
# Keep the first row seen for each tweet URL, then renumber the rows from 0
df = df.drop_duplicates(subset=["tweetLink"]).reset_index(drop=True)

# Show how many rows survive de-duplication, then preview them
print(f"Data size: {df.shape}\n")
df.head()

Data size: (11002, 8)



Unnamed: 0,tweetLink,retweet_count,favorite_count,reply_count,quote_count,text,profileUrl,name
0,https://twitter.com/alvinfoo/status/1391184083...,10,22,7,1,Happy Mother’s Day! \n#happymothersday2021 #Ha...,https://twitter.com/alvinfoo,Alvin Foo
1,https://twitter.com/jblefevre60/status/1391340...,13,0,0,0,RT @pierrepinna: My buddy Jay the gull wishes ...,https://twitter.com/jblefevre60,Jean-Baptiste Lefevre
2,https://twitter.com/jblefevre60/status/1391391...,2,0,0,0,RT @Bastwins: Happy electric birthday to #guit...,https://twitter.com/jblefevre60,Jean-Baptiste Lefevre
3,https://twitter.com/jblefevre60/status/1391441...,11,0,0,0,RT @HaroldSinnott: What Is #5G ? \n\nWhere Is ...,https://twitter.com/jblefevre60,Jean-Baptiste Lefevre
4,https://twitter.com/jblefevre60/status/1391441...,17,0,0,0,RT @ipfconline1: How Do #NeuralNetworks Learn?...,https://twitter.com/jblefevre60,Jean-Baptiste Lefevre


In [10]:
# Profile the posts again now that duplicate tweet links have been removed.
ProfileReport(df)

Summarize dataset:   0%|          | 0/21 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



## Save the aggregated data

In [11]:
# to_csv does not create missing folders, so make sure the output directory
# exists before writing (a no-op when it is already there).
DATA_PATH.mkdir(parents=True, exist_ok=True)
# index=False: the row index is just a 0..n-1 counter and is not worth persisting.
df.to_csv(DATA_PATH / "tweets_aggregated.csv", index=False)