In [2]:
import pandas as pd
import numpy as np
import git
import os 
import matplotlib.pyplot as plt
# Show full cell contents when rendering frames. `None` is the supported
# "no truncation" value; the legacy -1 sentinel is deprecated and is
# rejected by recent pandas releases.
pd.set_option('display.max_colwidth', None)

In [3]:
def get_git_root(path):
    """Return the top-level directory of the git repository containing `path`.

    Parent directories are searched, so `path` may be any location inside
    the working tree. The value is whatever
    ``git rev-parse --show-toplevel`` reports for that repository.
    """
    repo = git.Repo(path, search_parent_directories=True)
    return repo.git.rev_parse("--show-toplevel")

In [4]:
# Resolve every project directory relative to the repository root.
top_dir = get_git_root(os.getcwd())
input_dir = os.path.join(top_dir, "input")
extraction_new_dir = os.path.join(input_dir, "hexagon_extract_new")
extraction_rem_dir = os.path.join(input_dir, "hexagon_extract")
# Reuse top_dir rather than asking git for the repository root a second time.
model_dir = os.path.join(top_dir, "models")

In [5]:
from tqdm.auto import tqdm
def get_consolidated(dir_path):
    """Read every ``.csv`` file directly inside `dir_path` into a DataFrame.

    Files are read in sorted filename order so the result does not depend
    on the arbitrary ordering returned by ``os.listdir``.

    Parameters
    ----------
    dir_path : str
        Directory to scan (not recursive).

    Returns
    -------
    list of pandas.DataFrame
        One frame per CSV file, in sorted filename order. The list is empty
        when the directory holds no ``.csv`` files.
    """
    csv_names = sorted(name for name in os.listdir(dir_path) if name.endswith(".csv"))
    frames = []
    for name in tqdm(csv_names):
        # Rows are split on "\n" only -- presumably so carriage returns inside
        # the text fields are not treated as row breaks (TODO confirm).
        frames.append(pd.read_csv(os.path.join(dir_path, name), lineterminator="\n"))
    return frames

### consolidating the files for weed data

In [None]:
# Read every per-file weed extract and stack them into a single frame.
dir_path = os.path.join(input_dir,"weed_data_extract")
# `frames` must be built here; without this call the concat below fails with
# a NameError on a fresh kernel.
frames = get_consolidated(dir_path)
df_weed = pd.concat(frames,axis=0,ignore_index=True)

In [18]:
# Total number of rows, then number of distinct userID values.
print(df_weed.shape[0])
print(len(df_weed["userID"].unique()))

10420444

## merging the weed and juul data

In [19]:
# Load the weed data from input/weed_data.csv.
df_weed = pd.read_csv(
    os.path.join(input_dir, "weed_data.csv"),
    lineterminator="\n",
)

554815

In [10]:
# Sub file holding the tweet text together with the listed user / engagement
# columns. The source frame is df_weed (the bare name `df` is never defined
# in this notebook).
# NOTE: the frame index is written as the first CSV column (no index=False).
df_sub = df_weed[['favourites_count','followersCount', 'friendsCount', 'hashtags','listedCount', 'retweetCount', 'retweetText',
       'retweeted', 'statusesCount', 'tweetCreatedAt', 'tweetId', 'tweetText', 'userID',]]

df_sub.to_csv(os.path.join(input_dir,"weed_data_sub.csv"))



In [8]:
# Text-oriented file: hashtags, retweetText, tweetCreatedAt, tweetId,
# tweetText and userID only. The source frame is df_weed (the bare name `df`
# is never defined in this notebook). Written without the index column.
df_sub = df_weed[['hashtags','retweetText','tweetCreatedAt','tweetId', 'tweetText','userID',]]
df_sub.to_csv(os.path.join(input_dir,"weed_data_text.csv"),index=False)

In [61]:
# Reduced text file: rows with a non-null retweetText take that value as their
# tweetText, then the hashtags and retweetText columns are dropped.
# weed_data_text.csv is written with index=False, so it must be read without
# index_col; otherwise `hashtags` becomes the index and the drop below fails.
df_sub = pd.read_csv(os.path.join(input_dir,"weed_data_text.csv"),lineterminator="\n")
# Mask the rows directly instead of round-tripping through tweetId, which
# would also hit rows that merely share an id and blank their text with NaN.
has_retweet_text = df_sub["retweetText"].notnull()
df_sub.loc[has_retweet_text, "tweetText"] = df_sub.loc[has_retweet_text, "retweetText"]
df_sub = df_sub.drop(["hashtags","retweetText"],axis=1)
df_sub.to_csv(os.path.join(input_dir,"weed_data_text2.csv"),index=False)

### merging with selected columns (reducing the size)

In [6]:
## load the reduced weed data and the juul data, then report their sizes
df_weed = pd.read_csv(
    os.path.join(input_dir, "weed_data_text2.csv"),
    lineterminator="\n",
)
df_juul = pd.read_csv(
    os.path.join(input_dir, "juul_data.csv"),
    lineterminator="\n",
)

# distinct user ids in each source (reused by later cells)
weed_users = df_weed["userID"].unique()
juul_users = df_juul["userID"].unique()

print(f"weed_users {len(weed_users)}")
print(f"weed_data, {len(df_weed)}")

print()
print(f"juul_users {len(juul_users)}")
print(f"juul data {len(df_juul)}")

weed_users 554815
weed_data, 10420444

juul_users 887180
juul data 1692201


In [40]:
## A keyword regex will not match every tweet; the data was already curated
## with the boolean query, so no extra text matching/filtering is needed.
print(len(df_juul))
# case=False already covers "Juul" and "#juul", so the single literal is
# equivalent to the old alternation. na=False counts rows with missing text as
# non-matches instead of producing NaN in the mask. Summing the mask avoids
# materialising a filtered copy of the frame just to count it.
print(int(df_juul["tweetText"].str.contains("juul", case=False, na=False).sum()))

1692201
1671156


In [13]:
## Rows with a non-null retweetText take that value as their tweetText; then
## only the columns that df_weed has are kept.
# Mask the rows directly instead of round-tripping through tweetId, which
# would also hit rows that merely share an id and blank their text with NaN.
has_retweet_text = df_juul["retweetText"].notnull()
df_juul.loc[has_retweet_text, "tweetText"] = df_juul.loc[has_retweet_text, "retweetText"]
print("juul users",len(df_juul.userID.unique())) ## sanity check
df_juul = df_juul[list(df_weed.columns)]

juul users 887180


In [17]:
# Stack the weed and juul tweets, keeping only the four analysis columns.
columns = ["userID", "tweetId", "tweetCreatedAt", "tweetText"]
frames = [df_weed, df_juul]
df_dataset = pd.concat(frames, ignore_index=True).loc[:, columns]

In [28]:
## users present in the weed data but absent from the juul data
extra_users = list(set(weed_users) - set(juul_users))
print(len(extra_users))
## it does not make sense to keep data that was never part of the extraction
## process, so rows belonging to those users are filtered out
df_dataset = df_dataset[~df_dataset["userID"].isin(extra_users)]

3554


In [29]:
# Row count, then distinct user count, of the merged dataset.
print(df_dataset.shape[0])
print(len(df_dataset["userID"].unique()))

12088584
887180


In [44]:
# Persist the merged dataset (index column not written).
df_dataset.to_csv(
    os.path.join(input_dir, "dataset.csv"),
    index=False,
)

In [46]:
## sanity check: reload the merged data from disk and confirm its size
df_dataset = pd.read_csv(
    os.path.join(input_dir, "dataset.csv"),
    lineterminator="\n",
)
print(df_dataset.shape[0])                   # 12,088,584
print(len(df_dataset["userID"].unique()))    # 887,180

12088584
887180


## get the first occurrences

In [7]:
def _first_tweet_per_user(df, out_col):
    """Return one row per userID holding that user's earliest tweetCreatedAt.

    `df` must already carry datetime values in ``tweetCreatedAt``. The result
    has exactly the columns ``["userID", out_col]``.
    """
    return (
        df.groupby("userID")["tweetCreatedAt"]
        .min()
        .rename(out_col)
        .reset_index()
    )


# Parse the timestamps BEFORE sorting so the order is chronological rather
# than the lexicographic order of the raw strings.
df_weed = df_weed.assign(
    tweetCreatedAt=pd.to_datetime(df_weed["tweetCreatedAt"])
).sort_values(by=["tweetCreatedAt"])
first_weed = _first_tweet_per_user(df_weed, "weed_first")

## get the first occurrence for juul
df_juul = df_juul.assign(
    tweetCreatedAt=pd.to_datetime(df_juul["tweetCreatedAt"])
).sort_values(by=["tweetCreatedAt"])
first_juul = _first_tweet_per_user(df_juul, "juul_first")

In [8]:
## Combine the first-occurrence tables. This is a LEFT join from the juul
## table: every juul user is kept, and weed_first is missing for users that
## have no row in first_weed.
first_data = first_juul.join(
    other=first_weed.set_index("userID"),
    on="userID",
    how="left",
)

In [9]:
# Persist the per-user first-occurrence table (index column not written).
first_data.to_csv(
    os.path.join(input_dir, "user_first.csv"),
    index=False,
)

In [11]:
# Reload the per-user first-occurrence table from disk.
user_first_path = os.path.join(input_dir, "user_first.csv")
first_data = pd.read_csv(user_first_path)