In [1]:
import numpy as np
import pandas as pd
import random, re
from datetime import datetime

In [23]:
# Load the tweet table that every later cell works from.
# NOTE(review): the export cell at the end of this notebook writes `postdf`
# back to this very path ('edited_twitter_posts.csv'), so a re-run reads the
# previous run's output rather than the pristine input - confirm this is intended.
df = pd.read_csv('edited_twitter_posts.csv')

In [17]:
# Selecting a random word from tweet as a keyword
def select_random_word(text):
    """Return one randomly chosen word of ``text``, stripped and lower-cased.

    The text is split on runs of periods and spaces; pieces that are empty
    after stripping are discarded before the draw, so every remaining word is
    equally likely to be picked.

    Args:
        text: the tweet text (a ``str``).

    Returns:
        The chosen word in lower case, or ``""`` when the text contains no
        word at all (e.g. ``""`` or ``" . "``). The previous implementation
        looped forever on such input.
    """
    pieces = (piece.strip().lower() for piece in re.split('[. ]+', text))
    candidates = [piece for piece in pieces if piece]
    if not candidates:
        # Nothing to choose from: return instead of retrying endlessly.
        return ""
    return random.choice(candidates)

# Tag every tweet with one keyword drawn at random from its own text
df['keyword'] = df['Text'].map(select_random_word)

In [24]:
# Tally how many tweets received each keyword
counts = {}
for keyword in df['keyword']:
    counts[keyword] = counts.get(keyword, 0) + 1

In [25]:
# (keyword, count) pairs ordered from least to most frequent
sorted(counts.items(), key=lambda pair: pair[1])

[('pass', 2),
 ('education', 3),
 ('away', 3),
 ('expect', 3),
 ('impact', 3),
 ('though', 3),
 ('determine', 4),
 ('hand', 4),
 ('red', 4),
 ('carry', 4),
 ('near', 4),
 ('often', 4),
 ('else', 4),
 ('mean', 4),
 ('particularly', 4),
 ('environment', 4),
 ('sense', 4),
 ('difficult', 4),
 ('war', 4),
 ('executive', 4),
 ('treat', 4),
 ('show', 4),
 ('nothing', 4),
 ('rate', 5),
 ('entire', 5),
 ('almost', 5),
 ('social', 5),
 ('feel', 5),
 ('nature', 5),
 ('drop', 5),
 ('such', 5),
 ('traditional', 5),
 ('send', 5),
 ('avoid', 5),
 ('and', 5),
 ('data', 5),
 ('professor', 5),
 ('might', 5),
 ('five', 5),
 ('campaign', 5),
 ('color', 5),
 ('even', 5),
 ('benefit', 5),
 ('she', 5),
 ('occur', 5),
 ('which', 5),
 ('able', 5),
 ('by', 5),
 ('group', 5),
 ('usually', 5),
 ('father', 5),
 ('region', 5),
 ('seem', 6),
 ('include', 6),
 ('heart', 6),
 ('thus', 6),
 ('free', 6),
 ('summer', 6),
 ('deal', 6),
 ('assume', 6),
 ('work', 6),
 ('family', 6),
 ('green', 6),
 ('lay', 6),
 ('truth', 6

In [32]:
#collect the distinct usernames that appear in the posts
names = set(df['Username'])

In [33]:
#generate a location for each user
usernames, lats, longs = [], [], []
for user in names:
    # latitude is drawn before longitude, one pair per user
    lats.append(round(random.uniform(40, 50), 2))
    longs.append(round(random.uniform(-40, -30), 2))
    usernames.append(user)
# one row per user: the username plus its random coordinates (2 decimals)
locations = pd.DataFrame({'Username': usernames, 'lattitude': lats, 'longitude': longs})


In [34]:
#join the df of posts with the usernames
# The join key is spelled out: with on=None pandas joins on every column name
# the two frames have in common, so any extra shared column in `df` (for
# example location columns left over from an earlier export of this notebook)
# would silently become part of the key.
postdf = pd.merge(df, locations, on='Username')

In [35]:
#convert the date into a unix timestamp
def text_to_timestamp(timestamp_text):
    """Parse a 'YYYY-MM-DD HH:MM:SS' string and return it as a Unix timestamp.

    The parsed datetime carries no timezone, so ``datetime.timestamp()``
    interprets it as local time.

    Args:
        timestamp_text: date string in the format '%Y-%m-%d %H:%M:%S'.

    Returns:
        Seconds since the epoch as a float.

    Raises:
        ValueError: if the string does not match the expected format.
    """
    return datetime.strptime(timestamp_text, '%Y-%m-%d %H:%M:%S').timestamp()

#cutoff the date at the end of the period
# `end_text` is the cutoff moment in the same '%Y-%m-%d %H:%M:%S' format that
# text_to_timestamp parses; `unix_end` is that moment as a Unix timestamp and
# is later passed as the upper bound when interaction times are generated.
end_text = "2023-06-15 12:32:09"
unix_end = text_to_timestamp(end_text)

In [36]:
#convert unix timestamp back into date format
def timestamp_to_text(timestamp):
    """Format a Unix timestamp as a 'YYYY-MM-DD HH:MM:SS' string.

    The timestamp is converted with ``datetime.fromtimestamp`` (local time),
    making this the inverse of ``text_to_timestamp`` up to whole seconds.

    Args:
        timestamp: seconds since the epoch (int or float).

    Returns:
        The formatted date string; fractional seconds are dropped.
    """
    return datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S')

In [37]:
#generate random number of seconds after the post and before the cutoff
def random_date_within_range(date, shape=1.2, scale=10000):
    """Draw a gamma-distributed number of seconds that lies within [1, date].

    Values are sampled from ``np.random.gamma(shape, scale)`` and redrawn
    until one falls inside the closed interval ``[1, date]``.

    Args:
        date: upper bound in seconds, i.e. the time left between the post and
            the cutoff (despite its name, this is a duration, not a date).
        shape: shape parameter of the gamma distribution.
        scale: scale parameter of the gamma distribution.

    Returns:
        A float ``d`` with ``1 <= d <= date``.

    Raises:
        ValueError: if ``date`` is NaN or smaller than 1. No draw can ever be
            accepted then, and the previous implementation looped forever.
    """
    # `not date >= 1` (rather than `date < 1`) also rejects NaN.
    if not date >= 1:
        raise ValueError(
            "date must be at least 1 second, got {!r}".format(date)
        )
    while True:
        delay = np.random.gamma(shape=shape, scale=scale)
        if 1 <= delay <= date:
            return delay

In [38]:
#generate a random date for the date of the interaction
def generate_timestamp(timestamp, before):
    """Return a random date string that falls after ``timestamp``.

    Args:
        timestamp: the starting date as a '%Y-%m-%d %H:%M:%S' string.
        before: Unix timestamp acting as the upper limit; the remaining time
            (``before`` minus the starting time) is handed to
            ``random_date_within_range`` as the maximum delay.

    Returns:
        The starting time plus the random delay, formatted by
        ``timestamp_to_text``.
    """
    posted_at = text_to_timestamp(timestamp)
    delay = random_date_within_range(before - posted_at)
    return timestamp_to_text(posted_at + delay)

In [39]:
#generate a date and user for each interaction
interactiondict = {'Username':[],'Tweet_ID':[],'Timestamp':[]}
for index, rows in df.iterrows():
    # One interaction per retweet and per like. int() keeps random.sample
    # working if iterrows() hands the counts back as floats.
    interactions = int(rows.Retweets + rows.Likes)
    # Exclude the author with a one-element set. set(rows.Username) would
    # split the name into single characters and leave the author in the pool.
    candidates = list(names - {rows.Username})
    # random.sample picks distinct users; it raises ValueError when a tweet
    # has more interactions than there are other users.
    users = random.sample(candidates, interactions)
    for user in users:
        time = generate_timestamp(rows.Timestamp, unix_end)
        interactiondict['Username'].append(user)
        interactiondict['Tweet_ID'].append(rows.Tweet_ID)
        interactiondict['Timestamp'].append(time)

In [40]:
#merge the location and interaction data frames
interactionsdf = pd.DataFrame(interactiondict)
# Join explicitly on 'Username' (the only column the two frames share) rather
# than relying on on=None, which joins on every common column name.
interactionsdf = pd.merge(interactionsdf, locations, on='Username')

In [22]:
#write the data frames to csvs
# NOTE(review): to_csv is called with its default index=True, so each file
# gets the row index as an extra unnamed first column - confirm consumers
# expect it, otherwise pass index=False.
# NOTE(review): 'edited_twitter_posts.csv' is also the file this notebook reads
# at the top, so this line overwrites the notebook's own input - confirm intended.
interactionsdf.to_csv('twitter_interactions.csv')
postdf.to_csv('edited_twitter_posts.csv')