In [None]:
import numpy as np
import pandas as pd
import requests
import time
import os
import re
import html
import ast

import jsonlines

import wand.image

import cv2 as cv
import json

### Read data

In [None]:
# Load the tweet table and cast every column to str. Note that .astype(str)
# also turns missing values into the literal string 'nan'.
data = pd.read_csv('concatenatedProlife_new.csv').astype(str)

### Function to make GET call

In [None]:
def getUserData(ids, bearer_token=None, timeout=30):
    """Look up Twitter users through the v2 ``GET /2/users`` endpoint.

    Parameters
    ----------
    ids : str
        Comma-separated user IDs, passed through as the ``ids`` query parameter.
    bearer_token : str, optional
        API bearer token. When omitted it is read from the
        ``TWITTER_BEARER_TOKEN`` environment variable so that no credential
        has to be stored in the notebook.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    dict
        The decoded JSON body of the response.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    if bearer_token is None:
        bearer_token = os.environ.get('TWITTER_BEARER_TOKEN', '')

    headers = {'Authorization': 'Bearer ' + bearer_token}
    url = "https://api.twitter.com/2/users"
    params = {
        'ids': ids,
        'user.fields': 'created_at,description,entities,id,location,name,pinned_tweet_id,profile_image_url,protected,url,username,verified,withheld',
        'tweet.fields': 'attachments,author_id,conversation_id,created_at,entities,geo,id,in_reply_to_user_id,lang'
    }

    r = requests.get(url=url, params=params, headers=headers, timeout=timeout)
    # Fail loudly on auth / rate-limit errors instead of handing an error
    # payload back to the caller as if it were user data.
    r.raise_for_status()

    return r.json()

In [None]:
# Number of rows in `data`.
data.shape[0]

### Make calls on a hopping window of 100

In [None]:
# Query the user-lookup endpoint with consecutive, non-overlapping slices of
# 100 author IDs and gather every returned user record.
collection = []
callCount = 0

for start in range(0, data.shape[0], 100):
    ids = ','.join(data.iloc[start:start + 100]['author_id'])

    # A response is not guaranteed to carry a 'data' key (for example when it
    # only holds an error payload); treat that as "no users in this batch"
    # rather than aborting the whole run with a KeyError.
    collection.extend(getUserData(ids).get('data', []))

    callCount += 1
    # Pause after every 49 requests to stay clear of the API rate limit.
    if callCount % 49 == 0:
        print(f"Sleeping for few minutes, {callCount} calls made.")
        time.sleep(1000)

user_data = pd.DataFrame(collection)

In [None]:
# Combine the previously saved users with the freshly fetched ones.
# The API delivers `id` as a string, while read_csv would infer an integer
# column; mixing the two makes every old/new pair look distinct, so a later
# drop_duplicates on `id` could not remove them. Reading `id` as str keeps
# both sides comparable.
temp = pd.read_csv('ConcatenatedUsers.csv', dtype={'id': str})
temp2 = pd.concat([temp, user_data])

In [None]:
# Drop repeated `id` rows and overwrite ConcatenatedUsers.csv with the result
# (row index is not written).
temp2.drop_duplicates(['id']).to_csv('ConcatenatedUsers.csv',index=False)

In [None]:
#user_data.to_csv('ConcatenatedUsers.csv',index=False)

### Get profile images

In [None]:
# Reload the saved user table from disk; column dtypes are inferred by pandas.
user_data = pd.read_csv('ConcatenatedUsers.csv')

In [None]:
# Download each user's profile image to profile_images/user_<id>.jpg.
# NOTE(review): 19100 looks like a resume offset from an earlier partial run;
# set it to 0 for a full download.
start_row = 19100
calls = start_row

os.makedirs('profile_images', exist_ok=True)

for _, user in user_data.iloc[start_row:].iterrows():
    url = str(user['profile_image_url'])
    userid = str(user['id'])

    try:
        response = requests.get(url, timeout=30)
        # Without this check a 404/403 error page would be saved as a ".jpg".
        response.raise_for_status()
    except requests.RequestException:
        # Best effort: skip users whose image cannot be fetched. Catching only
        # request errors keeps Ctrl-C / kernel interrupt working.
        continue

    with open(f'profile_images/user_{userid}.jpg', 'wb') as handler:
        handler.write(response.content)

    # Progress counter: counts successful downloads on top of the offset.
    calls += 1
    if calls % 100 == 0:
        print(f'Calls made = {calls}')

### Create JSON Lines file

In [None]:
# Preview the first rows of the user table.
user_data.head()

In [None]:
def _clean_text(value):
    """Reduce a profile text field to lowercase ASCII letters and single spaces.

    Steps, in order: drop @mentions and http(s) URLs, decode HTML entities,
    remove every character that is not A-Z, a-z or a space, lowercase, and
    collapse runs of whitespace. Missing values (NaN/None) yield '' instead of
    the literal text "nan" that str() would produce.
    """
    if pd.isna(value):
        return ''
    text = re.sub(r"(?:\@|https?\://)\S+", "", str(value))
    text = html.unescape(text)
    text = re.sub('[^A-Za-z ]+', '', text)
    return ' '.join(text.strip().lower().split())


# Build one record per user for the JSON Lines file.
json_dicts = []

for _, row in user_data.iterrows():

    # The image is looked up relative to the working directory, but the path
    # stored in the record is rooted at /content; users without a downloaded
    # image point at a shared placeholder file.
    if os.path.isfile(f'profile_images/user_{row["id"]}.jpg'):
        img = f'/content/profile_images/user_{row["id"]}.jpg'
    else:
        img = '/content/profile_images/missing_image.jpg'

    json_dicts.append({
        "id": str(row['id']),
        "name": _clean_text(row['name']),
        "screen_name": _clean_text(row['username']),
        "description": _clean_text(row['description']),
        "lang": "en",
        "img_path": img
    })

In [None]:
# with open("data.jsonl", "w", encoding="utf-8") as text_file:
#     text_file.write(jsonstring)

# Write every record in json_dicts to data.jsonl, replacing any existing file.
with jsonlines.open('data.jsonl', 'w') as writer:
    writer.write_all(json_dicts)

### Find grayscale images

In [None]:
# Collect the ids of users whose saved profile image is grayscale, i.e. all
# three colour channels are identical at every pixel.
collect_graysc = []

for _, row in user_data.iterrows():
    new_url = f'profile_images/user_{row["id"]}.jpg'
    if not os.path.isfile(new_url):
        continue

    img = cv.imread(new_url)
    if img is None:
        # cv.imread signals an unreadable/corrupt file by returning None;
        # skip it instead of failing on `.shape`.
        continue

    # Whole-array comparison replaces the per-pixel Python loop.
    first, second, third = img[..., 0], img[..., 1], img[..., 2]
    if np.array_equal(first, second) and np.array_equal(second, third):
        print(row['id'])
        collect_graysc.append(row["id"])

In [None]:
# Number of ids collected in collect_graysc.
len(collect_graysc)

### Remove alpha channel

In [None]:
def hasAlpha(image_path):
    """Open the image at ``image_path`` with Wand and return its ``alpha_channel`` attribute."""
    with wand.image.Image(filename=image_path) as picture:
        return picture.alpha_channel

In [None]:
def removeAlpha(image_path, new_image_path):
    """Flatten an image's alpha channel onto white and save the result.

    Parameters
    ----------
    image_path : str
        File to read.
    new_image_path : str
        File to write; may be the same as ``image_path``.
    """
    with wand.image.Image(filename=image_path) as img:
        # The background has to be chosen first: the 'remove' operation
        # composites the image over whatever background colour is set at the
        # moment it runs.
        img.background_color = wand.image.Color('white')
        img.alpha_channel = 'remove'
        img.save(filename=new_image_path)

In [None]:
# Rewrite every non-grayscale original image without its alpha channel,
# reading from profile_images_orig/ and writing to profile_images/.
grayscale_ids = set(collect_graysc)  # set lookup instead of scanning the list per row

for _, row in user_data.iterrows():
    if row['id'] in grayscale_ids:
        continue

    url = f'profile_images_orig/user_{row["id"]}.jpg'
    new_url = f'profile_images/user_{row["id"]}.jpg'
    if not os.path.isfile(url):
        continue

    try:
        removeAlpha(url, new_url)
    except Exception as exc:
        # Still best effort, but report the failure so broken images are
        # visible rather than silently skipped.
        print(f"Could not remove alpha for user {row['id']}: {exc}")

### Find image size mismatch

In [None]:
# Print the id of every user whose saved profile image is not 48x48 pixels.
for _, row in user_data.iterrows():
    new_url = f'profile_images/user_{row["id"]}.jpg'
    if not os.path.isfile(new_url):
        continue

    img = cv.imread(new_url)
    if img is None:
        # cv.imread returns None when the file cannot be decoded; report it
        # instead of failing on `.shape`.
        print(f"{row['id']} (unreadable image)")
        continue

    # The first two entries of shape are (rows, columns) = (height, width).
    height, width = img.shape[:2]
    if height != 48 or width != 48:
        print(row['id'])

### Read M3 outputs

In [None]:
# Load the M3 output; the context manager closes the file handle afterwards.
with open('m3_data.json') as f:
    m3_data = json.load(f)

In [None]:
# Turn the loaded JSON into a table: the top-level keys start out as columns,
# the transpose makes them the row index, and reset_index/rename exposes that
# index as an `id` column.
# NOTE: this rebinds m3_data, so the cell is meant to run once per load.
m3_data = (
    pd.DataFrame(m3_data)
    .T
    .reset_index()
    .rename(columns={'index': 'id'})
)
m3_data.head()

In [None]:
def getMostProbableKey(x):
    """Return the key of ``x`` holding the largest value.

    Only values strictly greater than 0 are considered; if there is none
    (including an empty mapping) the result is None. When several keys share
    the largest value, the first one encountered wins.
    """
    best_key, best_val = None, 0
    for candidate, score in x.items():
        if score > best_val:
            best_key, best_val = candidate, score
    return best_key

# Replace each per-label score dict with its most probable label.
# NOTE: this rewrites the columns in place, so the cell must not be re-run
# without reloading m3_data (the values would no longer be dicts).
for column in ('gender', 'age', 'org'):
    m3_data[column] = m3_data[column].apply(getMostProbableKey)

# The original last line referenced the undefined name `m3_dat`, which raises
# NameError; show a preview of the frame instead.
m3_data.head()

In [None]:
# Cast ids to str, in place, so they compare equal to string ids when merging.
user_data['id'] = user_data['id'].astype(str)

In [None]:
# Join the user table with the M3 labels on `id` (pandas' default join type)
# and save the combined table without the row index.
concatenatedUsers_m3 = user_data.merge(m3_data, on='id')
concatenatedUsers_m3.to_csv('ConcatenatedUsers_m3.csv', index=False)

In [None]:
# Load the combined table and make its ids strings.
all_data = pd.read_csv('all_data.csv')
all_data['id'] = all_data['id'].astype(str)

# The first line of sentiment.txt holds a Python literal, parsed safely with
# ast.literal_eval (presumably a dict keyed by id; verify against the file).
# Only that line is read, and the handle is closed when the block exits.
with open('sentiment.txt', 'r') as f:
    sent_collection = ast.literal_eval(f.readline())

In [None]:
# Add a `sentiment` column by mapping each row's id through sent_collection
# (presumably a dict keyed by id; ids without a match become NaN in that case).
all_data['sentiment']=all_data.id.map(sent_collection)

In [None]:
# Overwrites the input file all_data.csv with the current frame (no row index).
all_data.to_csv('all_data.csv',index=False)