In [1]:
!pip install numpy pandas faker scikit-learn nltk

Collecting numpy
  Using cached numpy-2.2.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
Collecting pandas
  Using cached pandas-2.2.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
Collecting faker
  Downloading faker-37.1.0-py3-none-any.whl.metadata (15 kB)
Collecting scikit-learn
  Using cached scikit_learn-1.6.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting nltk
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Using cached scipy-1.15.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Co

In [2]:
import numpy as np
import pandas as pd
from faker import Faker
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import re

In [3]:
# Initialize Faker and download NLTK resources
fake = Faker()
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/snt/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /home/snt/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

Data Generation

In [4]:
def generate_synthetic_data(num_users=1000, num_tweets=5000, num_interactions=20000):
    # Generate users
    users = pd.DataFrame([{
        'user_id': i,
        'join_date': fake.date_this_decade(),
        'location': fake.city(),
        'bio': fake.sentence(),
        'verified': np.random.choice([0, 1], p=[0.9, 0.1])
    } for i in range(num_users)])
    
    # Generate tweets with topics
    topics = ['politics', 'technology', 'sports', 'entertainment', 'science']
    tweets = pd.DataFrame([{
        'tweet_id': i,
        'author_id': np.random.randint(0, num_users),
        'content': fake.text(max_nb_chars=280),
        'timestamp': fake.date_time_this_year(),
        'topic': np.random.choice(topics),
        'likes': np.random.poisson(lam=50),
        'retweets': np.random.poisson(lam=10)
    } for i in range(num_tweets)])
    
    # Generate interactions
    interactions = pd.DataFrame([{
        'user_id': np.random.randint(0, num_users),
        'tweet_id': np.random.randint(0, num_tweets),
        'interaction_type': np.random.choice(['view', 'like', 'retweet', 'reply'], 
            p=[0.7, 0.2, 0.08, 0.02]),
        'timestamp': fake.date_time_this_year()
    } for _ in range(num_interactions)])
    
    return users, tweets, interactions


Generate synthetic data

In [5]:
users, tweets, interactions = generate_synthetic_data()

In [6]:
users

Unnamed: 0,user_id,join_date,location,bio,verified
0,0,2020-03-15,Dickersonberg,When certainly movie use cold most memory fede...,0
1,1,2022-04-13,North Heather,Purpose author who story region maintain.,0
2,2,2024-10-17,Port Nicholaschester,Oil page model concern form surface above.,0
3,3,2021-07-05,Davidville,Speech trip walk buy care.,0
4,4,2022-09-14,New Jonton,Ability admit choose customer computer lose th...,0
...,...,...,...,...,...
995,995,2024-04-25,South Patrickville,Success grow something letter group natural fo...,0
996,996,2023-11-10,New Kimberlyview,Start instead radio.,0
997,997,2023-03-05,Scotttown,Red run tree require style discuss interesting.,0
998,998,2023-12-16,Foxberg,Area model perhaps court so sound your level.,0


In [36]:
users.rename(columns={'user_id': 'author_id'}, inplace=True)

In [37]:
users

Unnamed: 0,author_id,join_date,location,bio,verified
0,0,2020-03-15,Dickersonberg,When certainly movie use cold most memory fede...,0
1,1,2022-04-13,North Heather,Purpose author who story region maintain.,0
2,2,2024-10-17,Port Nicholaschester,Oil page model concern form surface above.,0
3,3,2021-07-05,Davidville,Speech trip walk buy care.,0
4,4,2022-09-14,New Jonton,Ability admit choose customer computer lose th...,0
...,...,...,...,...,...
995,995,2024-04-25,South Patrickville,Success grow something letter group natural fo...,0
996,996,2023-11-10,New Kimberlyview,Start instead radio.,0
997,997,2023-03-05,Scotttown,Red run tree require style discuss interesting.,0
998,998,2023-12-16,Foxberg,Area model perhaps court so sound your level.,0


In [7]:
tweets

Unnamed: 0,tweet_id,author_id,content,timestamp,topic,likes,retweets
0,0,576,Those section apply top cup make. Amount write...,2025-01-05 13:49:15.259091,politics,56,9
1,1,902,Bar fish wall which information lead state. Re...,2025-01-10 22:34:11.507295,sports,57,12
2,2,536,High within program question send baby.\nExecu...,2025-03-20 23:35:12.044665,science,58,17
3,3,45,Single will collection contain center boy up. ...,2025-02-16 10:01:15.265265,sports,52,8
4,4,10,Several else eight news finish. Face worker ev...,2025-03-29 05:57:31.238311,politics,61,10
...,...,...,...,...,...,...,...
4995,4995,616,Player tree plant. Fall message note must natu...,2025-03-02 05:16:19.262778,entertainment,46,8
4996,4996,789,Bring agent direction example. Sport cause wil...,2025-03-24 23:44:47.942800,sports,60,11
4997,4997,38,Commercial energy power article value kitchen ...,2025-03-17 00:58:52.122169,technology,58,10
4998,4998,867,Attorney over specific surface. Least magazine...,2025-02-08 08:43:30.039877,science,51,9


In [8]:
interactions

Unnamed: 0,user_id,tweet_id,interaction_type,timestamp
0,273,495,view,2025-04-09 01:26:07.017974
1,439,495,view,2025-02-21 15:52:56.467010
2,569,2933,view,2025-03-15 10:12:18.485654
3,94,1964,like,2025-01-13 12:53:16.868723
4,330,2268,view,2025-01-10 15:37:11.300105
...,...,...,...,...
19995,499,1555,view,2025-03-29 04:59:40.467633
19996,189,42,like,2025-02-03 13:02:15.267284
19997,467,4012,retweet,2025-02-07 07:35:20.905668
19998,987,137,view,2025-02-23 20:53:48.306242


User Clustering

In [9]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return ' '.join(tokens)

Create user-topic matrix

In [10]:
user_topics = interactions.merge(tweets, on='tweet_id')
user_topics = user_topics.groupby(['user_id', 'topic']).size().unstack(fill_value=0)

In [32]:
user_topics

topic,entertainment,politics,science,sports,technology,cluster
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,3,2,5,8,2,0
1,5,2,7,2,5,3
2,5,3,5,6,4,0
3,4,5,2,3,4,2
4,8,4,8,4,3,4
...,...,...,...,...,...,...
995,4,4,5,5,5,0
996,1,2,5,5,2,0
997,1,2,4,2,2,2
998,1,4,2,5,4,2


Cluster users

In [11]:
kmeans = KMeans(n_clusters=5, random_state=42)
clusters = kmeans.fit_predict(user_topics)
user_topics['cluster'] = clusters

Candidate Sourcing

In [12]:
def get_candidate_tweets(target_user_id, tweets, user_topics, num_candidates=500):
    # Get user cluster
    cluster = user_topics.loc[target_user_id]['cluster']
    
    # Get tweets from same cluster users
    cluster_users = user_topics[user_topics['cluster'] == cluster].index
    cluster_tweets = tweets[tweets['author_id'].isin(cluster_users)]
    
    # Get popular tweets from all clusters
    popular_tweets = tweets.sort_values('likes', ascending=False).head(200)
    
    # Combine candidates
    candidates = pd.concat([cluster_tweets, popular_tweets]).drop_duplicates()
    return candidates.sample(min(num_candidates, len(candidates)))


Ranking Model

Prepare training data

In [13]:
interactions['target'] = interactions['interaction_type'].apply(
    lambda x: 1 if x in ['like', 'retweet', 'reply'] else 0)

merged_data = interactions.merge(users, on='user_id')
merged_data = merged_data.merge(tweets, on='tweet_id')

Feature engineering

In [14]:
features = merged_data[[
    'verified', 'likes', 'retweets',
    'topic' 
]]
target = merged_data['target']


Preprocessing pipeline

In [15]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['verified', 'likes', 'retweets']),
        ('cat', OneHotEncoder(), ['topic'])
])

Build model pipeline

In [16]:
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, class_weight='balanced'))
])

Train model

In [17]:
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42)
model.fit(X_train, y_train)

Post-Rules Application

In [54]:
def apply_post_rules(ranked_tweets, max_authors=3, banned_keywords=['spam', 'scam']):
    # Filter banned keywords
    clean_tweets = ranked_tweets[
        ~ranked_tweets['content'].str.contains('|'.join(banned_keywords), case=False)]
    
    # Limit authors
    clean_tweets = clean_tweets.sort_values(by='author_id', 
        key=lambda x: x.map(x.value_counts()))
    clean_tweets = clean_tweets.drop_duplicates(subset='author_id', keep='first')
    
    # Ensure topic diversity
    final_tweets = clean_tweets.sort_values(
        by=['predicted_score', 'topic'], 
        ascending=[False, True]
    ).head(50)
    
    return final_tweets


In [55]:
target_user = np.random.choice(users['author_id'])
candidates = get_candidate_tweets(target_user, tweets, user_topics)
candidates.merge(users, on='author_id')

Unnamed: 0,tweet_id,author_id,content,timestamp,topic,likes,retweets,join_date,location,bio,verified
0,3676,941,Despite over foot newspaper have score. Avoid ...,2025-03-11 07:21:11.875834,politics,34,9,2024-12-28,Scottbury,Feeling just federal mouth want matter.,0
1,1691,246,Eat may determine stock discuss feel statement...,2025-01-08 07:28:48.429219,entertainment,57,7,2022-05-24,Bellview,Lose law glass current entire.,0
2,2777,368,Surface floor animal probably cover. Skill eve...,2025-01-10 02:24:34.590872,sports,52,6,2021-01-24,New Markchester,Bill would laugh war important.,1
3,2929,134,Different family whole mouth level. Same nothi...,2025-03-21 11:53:51.120715,technology,63,9,2022-06-05,Lake Heatherhaven,Friend painting now approach name lose.,0
4,3364,339,Tax heart realize hot take so. Center very eve...,2025-02-24 17:09:14.835957,politics,44,10,2025-04-10,Hernandezborough,Fight different figure feel turn executive admit.,0
...,...,...,...,...,...,...,...,...,...,...,...
495,4822,312,Manage remember computer partner pass which. W...,2025-02-25 18:35:30.890441,technology,58,8,2024-11-19,Port Angela,Become push southern contain accept save pull.,0
496,4315,742,Happy successful machine difficult more state ...,2025-02-09 06:11:28.645900,politics,53,8,2024-08-24,Martinmouth,Low travel remember easy.,0
497,4333,600,Forget foreign speak model surface ever respon...,2025-02-17 10:15:35.195937,entertainment,47,5,2023-11-21,Julieport,Center similar rock affect environment church ...,1
498,1178,720,Town here court recent. Player car official na...,2025-01-27 18:26:39.753714,sports,48,6,2022-12-07,Port Sarahport,Goal might phone service.,0


Full Pipeline

In [56]:
def recommend_tweets(target_user_id, users, tweets, interactions, model):
    # Step 1: Candidate sourcing
    candidates = get_candidate_tweets(target_user_id, tweets, user_topics)
    candidates = candidates.merge(users, on='author_id')
    
    # Step 2: Feature preparation
    candidate_features = candidates[[
        'verified', 'likes', 'retweets', 'topic'
    ]]
    
    # Step 3: Ranking predictions
    predictions = model.predict_proba(candidate_features)[:, 1]
    candidates['predicted_score'] = predictions
    
    # Step 4: Apply post-rules
    ranked_tweets = candidates.sort_values('predicted_score', ascending=False)
    final_recommendations = apply_post_rules(ranked_tweets)
    
    return final_recommendations

In [59]:
target_user = np.random.choice(users['author_id'])
recommendations = recommend_tweets(target_user, users, tweets, interactions, model)
print(f"Recommendations for user {target_user}:")
print(recommendations[['tweet_id', 'content', 'predicted_score']].head(10))

Recommendations for user 134:
     tweet_id                                            content  \
31       3953  Why build degree. Capital beautiful factor mot...   
134      1188  Back but develop position. Focus buy Republica...   
195      2977  Standard federal time market half heavy option...   
472      3999  Effect effort during position. Until street re...   
414      1733  Development him expect former begin past. Comp...   
39       1128  Page friend theory team hotel article surface....   
138      2626  Score song leave face memory. Out soon total b...   
72        973  If leader final book finish race. Mention oper...   
296      4106  Method many toward your. Do little item. Reaso...   
382       365  Crime radio call study cut. Kind all some char...   

     predicted_score  
31          0.972337  
134         0.951104  
195         0.933842  
472         0.902509  
414         0.892429  
39          0.886283  
138         0.878132  
72          0.877452  
296         0.