In [188]:
import numpy as np
import pandas as pd
import gzip
import json
from sklearn import svm
from sklearn.preprocessing import LabelEncoder
from sklearn import tree
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from pprint import pprint

In [189]:
#@title Turkish StopWords

import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
turkish_stopwords = stopwords.words('turkish')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/cemaydemir/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [190]:
train_classification_df = pd.read_csv("train-classification.csv")
train_classification_df = train_classification_df.rename(columns={'Unnamed: 0': 'user_id', 'label': 'category'})
train_classification_df["category"]=train_classification_df["category"].apply(str.lower)
username2_category = train_classification_df.set_index("user_id").to_dict()["category"]

In [191]:
train_classification_df.groupby(by="category").count()

Unnamed: 0_level_0,user_id
category,Unnamed: 1_level_1
art,191
entertainment,323
fashion,299
food,511
gaming,13
health and lifestyle,503
mom and children,149
sports,113
tech,346
travel,294


In [192]:
username2_category

{'taskirancemal': 'mom and children',
 'tam_kararinda': 'food',
 'spart4nn': 'food',
 'sosyalyiyiciler': 'food',
 'sonaydizdarahad': 'mom and children',
 'somersivrioglu': 'food',
 'sinankoc': 'travel',
 'simulasyonturk': 'gaming',
 'savas_karakas_sudaki_izler': 'travel',
 'sakinenurunannesi': 'mom and children',
 'ruyabuyuktetik': 'fashion',
 'raykakumru': 'health and lifestyle',
 'pintipanda': 'gaming',
 'pinarindepresyonu': 'health and lifestyle',
 'pinarhotic': 'health and lifestyle',
 'pinardonmez_': 'mom and children',
 'ozgeninoltasi': 'food',
 'nayaozgun': 'fashion',
 'nataliyarcan': 'health and lifestyle',
 'muthispsikoloji': 'health and lifestyle',
 'murattekecicom': 'tech',
 'mrsisbeceren': 'mom and children',
 'mosyosokola': 'food',
 'mirandaorlayn': 'entertainment',
 'mezarci_pubg': 'gaming',
 'mervedemireltaskiran': 'travel',
 'lifewithapineapple': 'travel',
 'lalsefkatli': 'travel',
 'kutupanne': 'mom and children',
 'kosifcihan': 'health and lifestyle',
 'kisikatescom':

In [193]:
train_data_path = "training-dataset.jsonl.gz"
username2posts_train = dict()
username2profile_train = dict()

username2posts_test = dict()
username2profile_test = dict()


with gzip.open(train_data_path, "rt") as fh:
  for line in fh:
    sample = json.loads(line)

    profile = sample["profile"]
    username = profile["username"]
    if username in username2_category:
      # train data info
      username2posts_train[username] = sample["posts"]
      username2profile_train[username] = profile


    else:
      # it is test data info
      username2posts_test[username] = sample["posts"]
      username2profile_test[username] = profile


In [194]:
username2posts_train["deparmedya"][0]

{'caption': 'Cumhuriyetimizin 100.yƒ±lƒ± kutlu olsun‚ôæÔ∏èüáπüá∑',
 'comments_count': 0,
 'id': '17990918969458720',
 'like_count': 6,
 'media_type': 'IMAGE',
 'media_url': 'https://scontent-sof1-2.cdninstagram.com/v/t51.29350-15/396342908_267936919574308_4264417069827989599_n.jpg?_nc_cat=107&ccb=1-7&_nc_sid=c4dd86&_nc_ohc=IynXuQSoOT8AX9RSy20&_nc_ht=scontent-sof1-2.cdninstagram.com&edm=AL-3X8kEAAAA&oh=00_AfA8OKAM0MY9tqg6dw8C8I5TJp4SHPBp-VlNXrFAh2agqg&oe=6563581C',
 'timestamp': '2023-10-29 09:12:30'}

In [195]:
username2profile_test['kozayarismasi']

{'username': 'kozayarismasi',
 'id': '2069255615',
 'full_name': 'KOZA',
 'biography': '31. Koza Gen√ß Moda Tasarƒ±mcƒ±larƒ± Yarƒ±≈ümasƒ±\n#Koza2023 #KozaYarismasi',
 'category_name': 'Nonprofit organization',
 'post_count': None,
 'follower_count': 8850,
 'following_count': 300,
 'is_business_account': False,
 'is_private': False,
 'is_verified': False,
 'highlight_reel_count': 43,
 'bio_links': '"[{\'title\': \'\', \'lynx_url\': \'https://l.instagram.com/?u=http%3A%2F%2Fkozayarismasi.com%2F&e=AT0OQsehmPdd1gnOp6Vealz2RzNJH8I2pNBxulllkahoxwelJmW4fuW73V-oNkJNVPVf4kKYFDcTLaKKbpBqsjQGPCcY8OvO\', \'url\': \'http://kozayarismasi.com/\', \'link_type\': \'external\'}]"',
 'entities': '31. Koza Gen√ß Moda Tasarƒ±mcƒ±larƒ± Yarƒ±≈ümasƒ±\n#Koza2023 #KozaYarismasi',
 'ai_agent_type': None,
 'fb_profile_biolink': None,
 'restricted_by_viewer': None,
 'country_block': False,
 'eimu_id': '103876724348934',
 'external_url': 'http://kozayarismasi.com/',
 'fbid': '17841402148468656',
 'has_clips': True,

In [196]:
# Profile Dataframe
train_profile_df = pd.DataFrame(username2profile_train).T.reset_index(drop=True)
test_profile_df = pd.DataFrame(username2profile_test).T.reset_index(drop=True)

train_profile_df.head(2)

Unnamed: 0,username,id,full_name,biography,category_name,post_count,follower_count,following_count,is_business_account,is_private,...,business_category_name,overall_category_name,category_enum,is_verified_by_mv4b,is_regulated_c18,profile_pic_url,should_show_category,should_show_public_contacts,show_account_transparency_details,profile_picture_base64
0,deparmedya,3170700063,Depar Medya,#mediaplanning #mediabuying #sosyalmedya,Local business,,1167,192,True,False,...,,,LOCAL,False,False,https://instagram.fsaw2-3.fna.fbcdn.net/v/t51....,True,True,True,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
1,kafesfirin,266439571,KAFES FIRIN,üìçSoÃàgÃÜuÃàtoÃàzuÃàüìçFTZ AVM\nüõíAnkara macro‚ñ≤center v...,Brand,,11997,17,True,False,...,,,BRAND,False,False,https://instagram.fada1-13.fna.fbcdn.net/v/t51...,True,True,True,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...


In [197]:
test_profile_df.head(2)

Unnamed: 0,username,id,full_name,biography,category_name,post_count,follower_count,following_count,is_business_account,is_private,...,business_category_name,overall_category_name,category_enum,is_verified_by_mv4b,is_regulated_c18,profile_pic_url,should_show_category,should_show_public_contacts,show_account_transparency_details,profile_picture_base64
0,beyazyakaliyiz,8634457436,Selam Beyaz Yakalƒ±,Beyaz yakalƒ±larƒ±n d√ºnyasƒ±na ho≈ügeldiniz üòÄüòÄüòÄ,Personal blog,,1265,665,True,False,...,,,PERSONAL_BLOG,False,False,https://instagram.fist6-1.fna.fbcdn.net/v/t51....,True,True,True,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
1,totalenergies_istasyonlari,7066643793,TotalEnergies IÃástasyonlarƒ±,TotalEnergies ƒ∞stasyonlarƒ± resmi Instagram hes...,Energy Company,,28025,4,True,False,...,,,ENERGY_COMPANY,False,False,https://instagram.fsaw2-1.fna.fbcdn.net/v/t51....,True,True,True,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...


In [198]:
from sklearn.feature_extraction.text import TfidfVectorizer
import re

def preprocess_text(text: str):
    # lower casing Turkish Text, Don't use str.lower :)
    text = text.casefold()

    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Remove special characters and punctuation
    # HERE THE EMOJIS stuff are being removed, you may want to keep them :D
    text = re.sub(r'[^a-z√ßƒüƒ±√∂≈ü√º0-9\s#@]', '', text)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Remove extra whitespaces
    text = re.sub(r'\s+', ' ', text).strip()

    return text


corpus = []

# to keep the label order
train_usernames = []

for username, posts in username2posts_train.items():
  train_usernames.append(username)

  # aggregating the posts per user
  cleaned_captions = []
  for post in posts:
    post_caption = post.get("caption", "")
    if post_caption is None:
      continue

    post_caption = preprocess_text(post_caption)

    if post_caption != "":
      cleaned_captions.append(post_caption)


  # joining the posts of each user with a \n
  user_post_captions = "\n".join(cleaned_captions)
  corpus.append(user_post_captions)


vectorizer = TfidfVectorizer(stop_words=turkish_stopwords, max_features=5000)

# fit the vectorizer
vectorizer.fit(corpus)


# transform the data into vectors
x_post_train = vectorizer.transform(corpus)
y_train = [username2_category.get(uname, "NA") for uname in train_usernames]


test_usernames = []
test_corpus = []
for username, posts in username2posts_test.items():
  test_usernames.append(username)
  # aggregating the posts per user
  cleaned_captions = []
  for post in posts:
    post_caption = post.get("caption", "")
    if post_caption is None:
      continue

    post_caption = preprocess_text(post_caption)

    if post_caption != "":
      cleaned_captions.append(post_caption)

  user_post_captions = "\n".join(cleaned_captions)
  test_corpus.append(user_post_captions)


# Just transforming! No Fitting!!!!!
x_post_test = vectorizer.transform(test_corpus)

In [199]:
# Making sure everything is fine
assert y_train.count("NA") == 0

In [200]:
feature_names = vectorizer.get_feature_names_out()
feature_names

array(['abdullah', 'abone', 'about', ..., '≈üƒ±k', '≈üƒ±klƒ±k', '≈üƒ±klƒ±ƒüƒ±'],
      dtype=object)

In [201]:
df_tfidf = pd.DataFrame(x_post_train.toarray(), columns=feature_names)
df_tfidf.head()

Unnamed: 0,abdullah,abone,about,acele,acil,activities,acƒ±,ad,ada,adam,...,≈üubemiz,≈üubesi,≈ü√∂len,≈ü√∂leni,≈ü√∂yle,≈ü√ºkranla,≈ü√ºk√ºr,≈üƒ±k,≈üƒ±klƒ±k,≈üƒ±klƒ±ƒüƒ±
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.050596,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.041839,0.0,0.0,0.0,0.0,0.0
3,0.047264,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.046548,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [202]:
import json
import numpy as np

# Load test data from the JSONL file
with open("test-regression-round2.jsonl", "r") as f:
    test_data = [json.loads(line) for line in f]




In [203]:
id_list=[]
for  post in test_data:
    id=post["id"]
    id_list.append(id)
len(id_list)

3000

In [204]:
id_list

['17903451397703117',
 '17896404506845900',
 '17853971531941549',
 '18362044393058713',
 '17999365834969022',
 '18020868037872253',
 '18013287148601185',
 '18019742449567154',
 '18116761582332028',
 '18006379492926193',
 '17998975151218109',
 '18025264618764141',
 '17906977549563245',
 '18013521889517401',
 '17954387045050844',
 '17953476020069513',
 '18030225580657632',
 '17884877039682929',
 '17979533717351528',
 '17975764259290937',
 '17881555487485739',
 '17985820520306917',
 '18036121630589876',
 '17877635093918025',
 '17997401021123841',
 '17984682878229994',
 '18112176310316293',
 '18096068665372342',
 '17958706499341475',
 '18220807072247882',
 '17907682292811483',
 '17967366122058393',
 '18274091311094151',
 '18001100212690547',
 '17854358829013795',
 '18244254190214896',
 '17968601219466127',
 '17980627964589025',
 '17908564031282979',
 '17881416746664586',
 '17960837012289427',
 '17991170914935957',
 '17978934941140170',
 '18340585753072850',
 '17892008867861803',
 '18000955

In [205]:
test_data[100]


{'caption': 'Bu bayramda sevdiklerinizle bir araya gelerek payla≈üƒ±mƒ±n ve dayanƒ±≈ümanƒ±n √∂nemini bir kez daha hatƒ±rlayalƒ±m. Birbirimizi sevgiyle kucaklayalƒ±m ve iyilikleri √ßoƒüaltalƒ±m. Bayramƒ±nƒ±z M√ºbarek Olsun. #kurbanbayramƒ±',
 'comments_count': 12,
 'id': '17990728147926797',
 'media_type': 'VIDEO',
 'media_url': 'https://scontent-sof1-2.cdninstagram.com/o1/v/t16/f1/m82/2E45113E6AF79E4A44C32B383496BF86_video_dashinit.mp4?efg=eyJ2ZW5jb2RlX3RhZyI6InZ0c192b2RfdXJsZ2VuLmNsaXBzLnVua25vd24tQzMuNzIwLmRhc2hfYmFzZWxpbmVfMV92MSJ9&_nc_ht=scontent-sof1-2.cdninstagram.com&_nc_cat=109&vs=651322233166803_599701476&_nc_vs=HBksFQIYT2lnX3hwdl9yZWVsc19wZXJtYW5lbnRfcHJvZC8yRTQ1MTEzRTZBRjc5RTRBNDRDMzJCMzgzNDk2QkY4Nl92aWRlb19kYXNoaW5pdC5tcDQVAALIAQAVAhg6cGFzc3Rocm91Z2hfZXZlcnN0b3JlL0dHMk5ReFdlY0piU2JYb0RBR0g1a01FMkctQW5icV9FQUFBRhUCAsgBACgAGAAbAYgHdXNlX29pbAExFQAAJranstXwxt4%2FFQIoAkMzLBdAJfnbItDlYBgSZGFzaF9iYXNlbGluZV8xX3YxEQB1AAA%3D&ccb=9-4&oh=00_AfDo9xqfyFYRutcl7KaY26FcU7WRjRUM8q1gQBLmGwrB

In [206]:
test_data[0]

{'caption': 'Zirvede online üèÇüèî',
 'comments_count': 3,
 'id': '17903451397703117',
 'media_type': 'IMAGE',
 'media_url': 'https://scontent-sof1-1.cdninstagram.com/v/t51.29350-15/144689983_245771040449649_7477726177662475884_n.jpg?_nc_cat=106&ccb=1-7&_nc_sid=c4dd86&_nc_ohc=vATPYA3eo-QAX_XOiEc&_nc_ht=scontent-sof1-1.cdninstagram.com&edm=AL-3X8kEAAAA&oh=00_AfD66rMt_pwjZFwdiq9RHysCzcuOtUpTfSLOPBs5JML7Hg&oe=65595DD0',
 'timestamp': '2021-02-02 14:31:34',
 'username': 'cemozkaynak'}

In [207]:
df=pd.DataFrame(test_data,columns=list(test_data[0].keys()),index=[np.arange(3000)])
    

In [208]:
df

Unnamed: 0,caption,comments_count,id,media_type,media_url,timestamp,username
0,Zirvede online üèÇüèî,3,17903451397703117,IMAGE,https://scontent-sof1-1.cdninstagram.com/v/t51...,2021-02-02 14:31:34,cemozkaynak
1,Volkswagen Passat B8 orijinal g√∂r√ºn√ºml√º ambiya...,63,17896404506845900,VIDEO,https://scontent-sof1-2.cdninstagram.com/o1/v/...,2023-11-23 14:04:45,uzmananahtar
2,#grwn ‚úåüèªü§ì,27,17853971531941549,VIDEO,,2023-03-29 18:18:23,furkanncagman
3,ü¶ç,125,18362044393058713,CAROUSEL_ALBUM,https://scontent-sof1-1.cdninstagram.com/v/t51...,2023-04-19 17:28:00,ozanyigitt
4,2 G√ºn Abu Dabi‚Äôyi gezdik. Eyl√ºl ortasƒ± olmasƒ±n...,108,17999365834969022,CAROUSEL_ALBUM,https://scontent-sof1-2.cdninstagram.com/v/t51...,2023-09-22 08:17:19,rotasizseyyah
...,...,...,...,...,...,...,...
2995,"7 adƒ±mda ""Kolajenin Faydalarƒ±"" hakkƒ±nda merak ...",2,17977355395361192,CAROUSEL_ALBUM,https://scontent-sof1-1.cdninstagram.com/v/t51...,2021-05-08 10:19:12,lifejentr
2996,Herkese selammmüíô\nBurasƒ± Sinop'un gizli kalmƒ±≈ü...,6,17981812192329769,CAROUSEL_ALBUM,https://scontent-sof1-2.cdninstagram.com/v/t51...,2021-01-13 14:42:59,gezmedenbilemezsin
2997,Doosan alƒ±mƒ± yapan ve Doosan kalitesini terci...,0,17968227499705037,IMAGE,https://scontent-sof1-1.cdninstagram.com/v/t51...,2022-07-30 06:46:14,maatsdoosan
2998,"Mobil internet, Teknosa'da g√ºzel! Dilediƒüin ye...",0,18280073578153697,IMAGE,https://scontent-sof1-2.cdninstagram.com/v/t51...,2023-10-13 09:28:34,teknosacell


In [209]:
def log_mse_like_counts(y_true, y_pred):
  """
  Calculate the Log Mean Squared Error (Log MSE) for like counts (log(like_count + 1)).

  Parameters:
  - y_true: array-like, actual like counts
  - y_pred: array-like, predicted like counts

  Returns:
  - log_mse: float, Log Mean Squared Error
  """
  # Ensure inputs are numpy arrays
  y_true = np.array(y_true)
  y_pred = np.array(y_pred)

  # Log transformation: log(like_count + 1)
  log_y_true = np.log1p(y_true)
  log_y_pred = np.log1p(y_pred)

  # Compute squared errors
  squared_errors = (log_y_true - log_y_pred) ** 2

  # Return the mean of squared errors
  return np.mean(squared_errors)

In [210]:
def highlight_reel_count(name_list):
    list_of_follow=[]
    for name in name_list:
        if name in list(username2profile_test.keys()):
          num_of_follow=username2profile_test[name].get("highlight_reel_count")
          list_of_follow.append(num_of_follow)
        else:
          num_of_follow=username2profile_train[name].get("highlight_reel_count")
          list_of_follow.append(num_of_follow)
    return list_of_follow

In [211]:
def check_type(list_):
    list_element=[]
    for element in list_:
        if  type(element)==int:
            list_element.append(element)
    return list_element

In [212]:
def find_last_like(name_list):
    list_averages=[]
    for name in name_list:
        list_=[]
        if name in list(username2posts_train.keys()):
            for post in username2posts_train[name]:
                like=post.get("like_count")
                list_.append(like)
        if name in list(username2posts_test.keys()):
           for post in username2posts_test[name]:
                like=post.get("like_count")
                if type(like)==None:
                    list.append(0)
                    like=0
                else:
                    list_.append(like)
        average=0
        list_=check_type(list_)
        if len(list_)!=0:
            sum_=list_[0]
            average=sum_
        list_averages.append(average)
    
    return list_averages
    

In [213]:
def find_average_like(name_list):
    list_averages=[]
    for name in name_list:
        list_=[]
        if name in list(username2posts_train.keys()):
            for post in username2posts_train[name]:
                like=post.get("like_count")
                list_.append(like)
        if name in list(username2posts_test.keys()):
           for post in username2posts_test[name]:
                like=post.get("like_count")
                if type(like)==None:
                    list.append(0)
                    like=0
                else:
                    list_.append(like)
        average=0
        list_=check_type(list_)
        if len(list_)!=0:
            sum_=sum(list_)
            average=sum_/len(list_)
        list_averages.append(average)
    
    return list_averages
    
    
    

In [214]:
def find_weighted_average_like(name_list):
    list_weighted_averages = []

    for name in name_list:
        likes_list = []

        # Collect likes from training data
        if name in username2posts_train.keys():
            for post in username2posts_train[name]:
                likes_list.append(post.get("like_count", 0))

        # Collect likes from test data
        if name in username2posts_test.keys():
            for post in username2posts_test[name]:
                likes_list.append(post.get("like_count", 0))

        # Ensure the list is clean and contains only numeric values
        likes_list = check_type(likes_list)

        if len(likes_list) == 0:
            list_weighted_averages.append(0)  # Default to 0 if no likes are available
            continue

        # Assign weights such that the last value has the highest weight
        weights = [i + 1 for i in range(len(likes_list))]
        total_weight = sum(weights)

        # Calculate the weighted average
        weighted_sum = sum(like * weight for like, weight in zip(likes_list, weights))
        weighted_average = weighted_sum / total_weight

        list_weighted_averages.append(weighted_average)

    return list_weighted_averages


In [215]:
def find_maximum_like(name_list):
    list_max_likes = []
    for name in name_list:
        list_ = []
        if name in username2posts_train.keys():
            for post in username2posts_train[name]:
                like = post.get("like_count")
                list_.append(like)
        if name in username2posts_test.keys():
            for post in username2posts_test[name]:
                like = post.get("like_count")
                list_.append(like or 0)  # Handle None values

        list_ = check_type(list_)
        if len(list_) > 0:
            max_like = max(list_)  # Ensure `max` refers to the built-in function
        else:
            max_like = 0
        list_max_likes.append(max_like)

    return list_max_likes


In [216]:
def find_minimum_like(name_list):
    list_max_likes=[]
    for name in name_list:
        list_=[]
        if name in list(username2posts_train.keys()):
            for post in username2posts_train[name]:
                like=post.get("like_count")
                list_.append(like)
        if name in list(username2posts_test.keys()):
           for post in username2posts_test[name]:
                like=post.get("like_count")
                if type(like)==None:
                    list.append(0)
                    like=0
                else:
                    list_.append(like)
        list_=check_type(list_)
        if len(list_)!=0:
          max_like=min(list_)
        else:
            max_like=0
        list_max_likes.append(max_like)
    
    return list_max_likes
    
    
    

In [217]:
def get_num_followers(name_list):
    list_of_follow=[]
    for name in name_list:
        if name in list(username2profile_test.keys()):
          num_of_follow=username2profile_test[name].get("follower_count")
          list_of_follow.append(num_of_follow)
        else:
          num_of_follow=username2profile_train[name].get("follower_count")
          list_of_follow.append(num_of_follow)
    return list_of_follow

In [218]:
def get_num_followings(name_list):
    list_of_follow=[]
    for name in name_list:
        if name in list(username2profile_test.keys()):
          num_of_follow=username2profile_test[name].get("following_count")
          list_of_follow.append(num_of_follow)
        else:
          num_of_follow=username2profile_train[name].get("following_count")
          list_of_follow.append(num_of_follow)
    return list_of_follow

In [219]:
def find_private(name_list):
    list_of_follow=[]
    for name in name_list:
        if name in list(username2profile_test.keys()):
          num_of_follow=username2profile_test[name].get("is_private")
          list_of_follow.append(num_of_follow)
        else:
          num_of_follow=username2profile_train[name].get("is_private")
          list_of_follow.append(num_of_follow)
    return list_of_follow

In [220]:
def find_verified(name_list):
    list_of_follow=[]
    for name in name_list:
        if name in list(username2profile_test.keys()):
          num_of_follow=username2profile_test[name].get('is_verified')
          list_of_follow.append(num_of_follow)
        else:
          num_of_follow=username2profile_train[name].get('is_verified')
          list_of_follow.append(num_of_follow)
    return list_of_follow

In [221]:
# Assuming necessary functions are defined
df["num_followers"] = get_num_followers(df["username"])
# df_full["is_private"] = find_private(df_full["username"])  # Uncomment if function exists
#df_full["is_verified"] = find_verified(df_full["username"])  # Uncomment if function exists
df["average_like"] = find_average_like(df["username"])
df["maximum_like"] = find_maximum_like(df["username"])  # Uses built-in max
df["highlight_reel_count"] = highlight_reel_count(df["username"])
df["minimum_like"] = find_minimum_like(df["username"])
#df["weighted_average_like"] = find_weighted_average_like(df["username"])



In [222]:
df

Unnamed: 0,caption,comments_count,id,media_type,media_url,timestamp,username,num_followers,average_like,maximum_like,highlight_reel_count,minimum_like
0,Zirvede online üèÇüèî,3,17903451397703117,IMAGE,https://scontent-sof1-1.cdninstagram.com/v/t51...,2021-02-02 14:31:34,cemozkaynak,10676,1105.828571,11377,4,250
1,Volkswagen Passat B8 orijinal g√∂r√ºn√ºml√º ambiya...,63,17896404506845900,VIDEO,https://scontent-sof1-2.cdninstagram.com/o1/v/...,2023-11-23 14:04:45,uzmananahtar,58038,1864.200000,5240,69,1165
2,#grwn ‚úåüèªü§ì,27,17853971531941549,VIDEO,,2023-03-29 18:18:23,furkanncagman,72118,2044.285714,8930,20,311
3,ü¶ç,125,18362044393058713,CAROUSEL_ALBUM,https://scontent-sof1-1.cdninstagram.com/v/t51...,2023-04-19 17:28:00,ozanyigitt,153063,22095.971429,74895,0,1424
4,2 G√ºn Abu Dabi‚Äôyi gezdik. Eyl√ºl ortasƒ± olmasƒ±n...,108,17999365834969022,CAROUSEL_ALBUM,https://scontent-sof1-2.cdninstagram.com/v/t51...,2023-09-22 08:17:19,rotasizseyyah,512923,13622.057143,38262,19,3295
...,...,...,...,...,...,...,...,...,...,...,...,...
2995,"7 adƒ±mda ""Kolajenin Faydalarƒ±"" hakkƒ±nda merak ...",2,17977355395361192,CAROUSEL_ALBUM,https://scontent-sof1-1.cdninstagram.com/v/t51...,2021-05-08 10:19:12,lifejentr,566,50.310345,97,4,11
2996,Herkese selammmüíô\nBurasƒ± Sinop'un gizli kalmƒ±≈ü...,6,17981812192329769,CAROUSEL_ALBUM,https://scontent-sof1-2.cdninstagram.com/v/t51...,2021-01-13 14:42:59,gezmedenbilemezsin,10546,291.285714,582,0,36
2997,Doosan alƒ±mƒ± yapan ve Doosan kalitesini terci...,0,17968227499705037,IMAGE,https://scontent-sof1-1.cdninstagram.com/v/t51...,2022-07-30 06:46:14,maatsdoosan,1893,28.085714,71,9,6
2998,"Mobil internet, Teknosa'da g√ºzel! Dilediƒüin ye...",0,18280073578153697,IMAGE,https://scontent-sof1-2.cdninstagram.com/v/t51...,2023-10-13 09:28:34,teknosacell,8820,12.371429,33,0,3


In [223]:
full_dataset=[]
for name in username2posts_train:
    for post in username2posts_train[name]:
        post["username"]=name
        full_dataset.append(post)
for name in username2posts_test:
    for post in username2posts_test[name]:
        post["username"]=name
        full_dataset.append(post)

In [224]:
df_full=pd.DataFrame(full_dataset,columns=full_dataset[0].keys(),index=np.arange(len(full_dataset)))

In [225]:
df_full

Unnamed: 0,caption,comments_count,id,like_count,media_type,media_url,timestamp,username
0,Cumhuriyetimizin 100.yƒ±lƒ± kutlu olsun‚ôæÔ∏èüáπüá∑,0,17990918969458720,6.0,IMAGE,https://scontent-sof1-2.cdninstagram.com/v/t51...,2023-10-29 09:12:30,deparmedya
1,Oriflame Duologi Lansmanƒ± #isve√ßtengeleng√ºzell...,1,18219250732221045,22.0,VIDEO,https://scontent-sof1-2.cdninstagram.com/o1/v/...,2023-08-08 19:11:13,deparmedya
2,#oriflameilesa√ßbakƒ±mdevrimi ‚úåÔ∏è,0,18311380465102328,19.0,VIDEO,https://scontent-sof1-2.cdninstagram.com/o1/v/...,2023-08-07 21:40:54,deparmedya
3,‚úåÔ∏è#oriflameilesa√ßbakƒ±mdevrimi 07Agustos‚Äô23 ori...,1,18089518138361507,19.0,VIDEO,https://scontent-sof1-2.cdninstagram.com/o1/v/...,2023-08-07 21:27:48,deparmedya
4,07 Agustos‚Äô23 #oriflameturkiye #duoloji,0,18012743929758497,21.0,VIDEO,https://scontent-sof1-2.cdninstagram.com/o1/v/...,2023-08-07 21:12:06,deparmedya
...,...,...,...,...,...,...,...,...
187297,Alƒ±≈üƒ±lmƒ±≈üƒ±n Dƒ±≈üƒ±na √áƒ±k!\n\nüìçƒ∞≈ü√ßi Bloklarƒ± mh. ...,1,17970896356402818,34.0,CAROUSEL_ALBUM,https://scontent-sof1-1.cdninstagram.com/v/t51...,2021-06-07 07:49:51,tetrancaffeine
187298,Bayramƒ±nƒ±z Kutlu olsunü•≥ü§©,0,17930502130524900,36.0,IMAGE,https://scontent-sof1-2.cdninstagram.com/v/t51...,2021-05-13 12:21:14,tetrancaffeine
187299,Ben Bir Kahve A≈üƒ±ƒüƒ±yƒ±m!\n\nƒ∞ddialƒ± Kahvelerin ...,2,17889308990150405,79.0,IMAGE,https://scontent-sof1-1.cdninstagram.com/v/t51...,2021-05-08 16:02:10,tetrancaffeine
187300,"Sonu√ßlar 332plus sayfasƒ±ndan,11.05.2021 tarihi...",380,17869632995380257,199.0,IMAGE,https://scontent-sof1-1.cdninstagram.com/v/t51...,2021-05-07 15:31:15,tetrancaffeine


In [226]:
# Assuming necessary functions are defined
df_full["num_followers"] = get_num_followers(df_full["username"])
# df_full["is_private"] = find_private(df_full["username"])  # Uncomment if function exists
#df_full["is_verified"] = find_verified(df_full["username"])  # Uncomment if function exists
df_full["average_like"] = find_average_like(df_full["username"])
df_full["maximum_like"] = find_maximum_like(df_full["username"])  # Uses built-in max
df_full["highlight_reel_count"] = highlight_reel_count(df_full["username"])
df_full["minimum_like"] = find_minimum_like(df_full["username"])
#df_full["weighted_average_like"] = find_weighted_average_like(df_full["username"])


In [227]:
df_full=df_full.dropna()


In [228]:
df_full

Unnamed: 0,caption,comments_count,id,like_count,media_type,media_url,timestamp,username,num_followers,average_like,maximum_like,highlight_reel_count,minimum_like
0,Cumhuriyetimizin 100.yƒ±lƒ± kutlu olsun‚ôæÔ∏èüáπüá∑,0,17990918969458720,6.0,IMAGE,https://scontent-sof1-2.cdninstagram.com/v/t51...,2023-10-29 09:12:30,deparmedya,1167,11.542857,26,6,4
1,Oriflame Duologi Lansmanƒ± #isve√ßtengeleng√ºzell...,1,18219250732221045,22.0,VIDEO,https://scontent-sof1-2.cdninstagram.com/o1/v/...,2023-08-08 19:11:13,deparmedya,1167,11.542857,26,6,4
2,#oriflameilesa√ßbakƒ±mdevrimi ‚úåÔ∏è,0,18311380465102328,19.0,VIDEO,https://scontent-sof1-2.cdninstagram.com/o1/v/...,2023-08-07 21:40:54,deparmedya,1167,11.542857,26,6,4
3,‚úåÔ∏è#oriflameilesa√ßbakƒ±mdevrimi 07Agustos‚Äô23 ori...,1,18089518138361507,19.0,VIDEO,https://scontent-sof1-2.cdninstagram.com/o1/v/...,2023-08-07 21:27:48,deparmedya,1167,11.542857,26,6,4
4,07 Agustos‚Äô23 #oriflameturkiye #duoloji,0,18012743929758497,21.0,VIDEO,https://scontent-sof1-2.cdninstagram.com/o1/v/...,2023-08-07 21:12:06,deparmedya,1167,11.542857,26,6,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
187297,Alƒ±≈üƒ±lmƒ±≈üƒ±n Dƒ±≈üƒ±na √áƒ±k!\n\nüìçƒ∞≈ü√ßi Bloklarƒ± mh. ...,1,17970896356402818,34.0,CAROUSEL_ALBUM,https://scontent-sof1-1.cdninstagram.com/v/t51...,2021-06-07 07:49:51,tetrancaffeine,1672,38.428571,199,4,18
187298,Bayramƒ±nƒ±z Kutlu olsunü•≥ü§©,0,17930502130524900,36.0,IMAGE,https://scontent-sof1-2.cdninstagram.com/v/t51...,2021-05-13 12:21:14,tetrancaffeine,1672,38.428571,199,4,18
187299,Ben Bir Kahve A≈üƒ±ƒüƒ±yƒ±m!\n\nƒ∞ddialƒ± Kahvelerin ...,2,17889308990150405,79.0,IMAGE,https://scontent-sof1-1.cdninstagram.com/v/t51...,2021-05-08 16:02:10,tetrancaffeine,1672,38.428571,199,4,18
187300,"Sonu√ßlar 332plus sayfasƒ±ndan,11.05.2021 tarihi...",380,17869632995380257,199.0,IMAGE,https://scontent-sof1-1.cdninstagram.com/v/t51...,2021-05-07 15:31:15,tetrancaffeine,1672,38.428571,199,4,18


CLEANƒ∞NG DF_FULL

In [229]:
#df_full = df_full[df_full["maximum_like"] != 0].reset_index(drop=True)
df_full=df_full.drop(columns=["caption","id","media_type","media_url","username","timestamp"])
df_test=df.drop(columns=["caption","id","media_type","media_url","username","timestamp"])

In [230]:
y=df_full["like_count"]

In [231]:
X=df_full.drop(columns="like_count")

In [232]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42)

In [233]:
original_train=X_train.copy()
original_y_train=y_train.copy()

In [234]:
X_train

Unnamed: 0,comments_count,num_followers,average_like,maximum_like,highlight_reel_count,minimum_like
137956,109,25503,770.142857,2422,1,332
94186,4,182772,900.628571,5741,8,260
186826,0,41276,22.457143,56,8,10
62230,0,18902,52.628571,166,7,17
132073,0,31319,213.314286,575,8,73
...,...,...,...,...,...,...
135938,0,16739,48.171429,99,9,24
118049,5,765734,4421.257143,91131,71,106
149454,2,76928,171.371429,331,23,63
166422,69,185517,3020.285714,15288,44,105


In [235]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
# Step 3: Preprocess Features
imputer = SimpleImputer(strategy="mean")  # Handle missing values
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

scaler = StandardScaler()  # Standardize features
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Log-transform target variable
y_train_log = np.log1p(y_train)
y_test_log = np.log1p(y_test)

# Step 4: Train the Model
model = GradientBoostingRegressor(
    n_estimators=200,
    max_depth=5,
    learning_rate=0.1,
    random_state=42
)
model.fit(X_train_scaled, y_train_log)

# Step 5: Predict and Evaluate
y_pred_log = model.predict(X_test_scaled)
y_pred = np.expm1(y_pred_log)

model_log_mse = log_mse_like_counts(y_test, y_pred)
print(f"Model Log MSE: {model_log_mse}")

Model Log MSE: 0.3378118312603661


In [254]:
import numpy as np
from sklearn.metrics import mean_squared_error

# Ensure X_test and y_test are numpy arrays
X_test_array = np.array(X_test_scaled)
y_test_array = np.array(y_test)

# Parameters
sample_size = 3000
num_iterations = 1000  # Number of times to run the sampling
errors = []

# Multiple runs
for i in range(num_iterations):
    random_indices = np.random.choice(len(X_test_array), sample_size, replace=False)
    X_test_sampled = X_test_array[random_indices]
    y_test_sampled = y_test_array[random_indices]

    # Predict on the sampled test set
    y_pred_log_sampled = model.predict(X_test_sampled)
    y_pred_sampled = np.expm1(y_pred_log_sampled)

    # Evaluate on the sampled set
    model_log_mse_sampled = log_mse_like_counts(y_test_sampled, y_pred_sampled)
    errors.append(model_log_mse_sampled)
    

# Find the maximum error
max_error = max(errors)
max_error


0.3916743308603251

In [237]:
y_pred_log = model.predict(X_train_scaled)
y_pred = np.expm1(y_pred_log)

model_log_mse = log_mse_like_counts(y_train, y_pred)
print(f"Model Log MSE: {model_log_mse}")

Model Log MSE: 0.32416074671909606


In [238]:
original_train["true_like_count"]=original_y_train
original_train["estimated_like_count"]=y_pred

In [239]:
original_train[100:130]

Unnamed: 0,comments_count,num_followers,average_like,maximum_like,highlight_reel_count,minimum_like,true_like_count,estimated_like_count
129809,0,2801,11.714286,29,1,2,19.0,9.399768
96860,7,6400,169.142857,699,4,47,128.0,184.022677
37240,0,2539,61.142857,121,0,15,68.0,47.948718
76315,7,4842,117.628571,238,3,21,150.0,142.071146
128133,3,46484,123.628571,469,79,25,157.0,112.43973
55305,6,6839,35.342857,129,5,15,21.0,44.420382
119069,11,44964,274.057143,821,6,86,821.0,357.160522
153844,0,3374,6.542857,15,7,2,3.0,5.485859
147892,0,3186,37.085714,67,12,22,36.0,35.42861
56958,3,8899,36.714286,115,0,17,115.0,41.745214


In [240]:
df_test = imputer.fit_transform(df_test)
scaler = StandardScaler()  # Standardize features
df_test_scaled = scaler.fit_transform(df_test)
y_pred_log=model.predict(df_test_scaled)
y_pred_=np.expm1(y_pred_log)


In [241]:
list_estimation=list(y_pred_)

In [242]:
id_list_test=list(df["id"])

In [243]:
dict_values=dict()
dict_values={k:v for (k,v) in zip(id_list,list_estimation)}

In [244]:
output_file = "predictions_test-regression-round1.json"
try:
    with open(output_file, "w") as f:
        json.dump(dict_values, f, indent=4)
    print(f"Predictions saved to {output_file}")
except Exception as e:
    print(f"Error writing to file: {e}")

Predictions saved to predictions_test-regression-round1.json


In [255]:
import json
file_path="predictions_test-regression-round1.json" 
with open(file_path,"r") as file:
    data=json.load(file)

5

In [260]:
dict_round=dict()
for id,estimate in data.items():
    dict_round[id]=round(estimate)

In [262]:
dict_round

{'17903451397703117': 8,
 '17896404506845900': 575,
 '17853971531941549': 640,
 '18362044393058713': 17989,
 '17999365834969022': 15162,
 '18020868037872253': 79,
 '18013287148601185': 66,
 '18019742449567154': 318,
 '18116761582332028': 5314,
 '18006379492926193': 66,
 '17998975151218109': 85,
 '18025264618764141': 107,
 '17906977549563245': 65,
 '18013521889517401': 66,
 '17954387045050844': 74,
 '17953476020069513': 2309,
 '18030225580657632': 73,
 '17884877039682929': 68,
 '17979533717351528': 59,
 '17975764259290937': 56,
 '17881555487485739': 5025,
 '17985820520306917': 134,
 '18036121630589876': 64,
 '17877635093918025': 66,
 '17997401021123841': 73,
 '17984682878229994': 20102,
 '18112176310316293': 64,
 '18096068665372342': 138,
 '17958706499341475': 84,
 '18220807072247882': 65,
 '17907682292811483': 5944,
 '17967366122058393': 3463,
 '18274091311094151': 126,
 '18001100212690547': 6124,
 '17854358829013795': 64,
 '18244254190214896': 71,
 '17968601219466127': 84,
 '179806279

In [263]:
output_file = "predictions_test-regression-round1.json"
try:
    with open(output_file, "w") as f:
        json.dump(dict_round, f, indent=4)
    print(f"Predictions saved to {output_file}")
except Exception as e:
    print(f"Error writing to file: {e}")

Predictions saved to predictions_test-regression-round1.json
