In [1]:
!pip install numpy pandas faker scikit-learn nltk



In [2]:
import numpy as np
import pandas as pd
from faker import Faker
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import re

In [3]:
# Initialize Faker and download NLTK resources
fake = Faker()
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/snt/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/snt/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Data Generation

In [34]:
def generate_synthetic_data(num_users=1000, num_tweets=5000, num_interactions=20000, num_products=200):
    # Generate users
    users = pd.DataFrame([{
        'user_id': i,
        'join_date': fake.date_this_decade(),
        'location': fake.city(),
        'bio': fake.sentence(),
        'verified': np.random.choice([0, 1], p=[0.9, 0.1])
    } for i in range(num_users)])
    
    # Generate tweets with topics
    topics = ['politics', 'technology', 'sports', 'entertainment', 'science']
    tweets = pd.DataFrame([{
        'tweet_id': i,
        'author_id': np.random.randint(0, num_users),
        'content': fake.text(max_nb_chars=280),
        'timestamp': fake.date_time_this_year(),
        'topic': np.random.choice(topics),
        'likes': np.random.poisson(lam=50),
        'retweets': np.random.poisson(lam=10)
    } for i in range(num_tweets)])
    
    # Generate interactions
    interactions = pd.DataFrame([{
        'user_id': np.random.randint(0, num_users),
        'tweet_id': np.random.randint(0, num_tweets),
        'interaction_type': np.random.choice(['view', 'like', 'retweet', 'reply'], 
            p=[0.7, 0.2, 0.08, 0.02]),
        'timestamp': fake.date_time_this_year()
    } for _ in range(num_interactions)])
    
    # Generate products
    product_categories = ['electronics', 'books', 'fashion', 'home goods', 'sports equipment']
    products = pd.DataFrame([{
        'product_id': i,
        'product_name': fake.catch_phrase(),
        'category': np.random.choice(product_categories),
        'price': np.random.uniform(10, 500),
        'rating': np.random.normal(4.5, 0.5)
    } for i in range(num_products)])
    
    # Add product mentions to tweets
    tweets['mentioned_product'] = np.random.choice(
        [-1] + list(range(num_products)), 
        size=num_tweets,
        p=[0.7] + [0.3/num_products]*num_products
    )
    
    return users, tweets, interactions, products


Generate synthetic data

In [35]:
users, tweets, interactions, products = generate_synthetic_data()

In [36]:
users

Unnamed: 0,user_id,join_date,location,bio,verified
0,0,2025-04-03,Michelleton,Present detail effort break right bad cell.,0
1,1,2023-01-30,Mauricefurt,Short player lose.,0
2,2,2020-10-15,Port Robert,Character ok him space big well year require.,1
3,3,2021-01-31,West Brittanyland,Others who field here write just.,0
4,4,2021-08-31,Nicholasside,Audience boy rather event position teacher high.,1
...,...,...,...,...,...
995,995,2023-02-21,Mullenmouth,Eight human class woman film.,0
996,996,2020-07-07,Aaronbury,Here collection reflect least give reveal area.,0
997,997,2020-05-14,Shannonbury,Performance above eight in.,0
998,998,2023-04-04,Ellenhaven,Serve expert here police in step.,0


In [23]:
tweets

Unnamed: 0,tweet_id,author_id,content,timestamp,topic,likes,retweets,mentioned_product
0,0,658,Leg actually tonight training. For success spe...,2025-01-28 08:48:32.046108,politics,55,7,-1
1,1,475,Improve wife nature way teacher red seven. He ...,2025-03-06 08:24:28.337057,sports,56,7,-1
2,2,883,Area control market fall sea exactly school be...,2025-03-22 08:24:05.836072,sports,54,9,-1
3,3,675,Someone head want nearly must.\nChoose electio...,2025-03-07 10:17:34.801566,sports,59,13,-1
4,4,698,Yard glass present art alone expect. Stock rel...,2025-04-01 13:39:31.088047,sports,40,7,-1
...,...,...,...,...,...,...,...,...
4995,4995,603,Near third through trial. Red Republican event...,2025-03-27 12:42:27.266684,sports,50,10,-1
4996,4996,324,While degree even now material daughter. Manag...,2025-02-04 15:10:40.600136,sports,58,12,179
4997,4997,226,Over reflect oil. Ground there student physica...,2025-03-18 22:01:32.970505,sports,45,11,199
4998,4998,888,Claim raise though my number administration ch...,2025-02-28 20:32:49.597584,entertainment,45,13,71


In [24]:
interactions

Unnamed: 0,user_id,tweet_id,interaction_type,timestamp
0,12,345,view,2025-03-01 23:07:13.836686
1,894,4922,view,2025-02-07 05:20:40.259222
2,876,1890,view,2025-03-24 12:06:41.237463
3,703,1737,view,2025-01-24 06:46:44.487447
4,634,3036,retweet,2025-01-08 09:01:58.078832
...,...,...,...,...
19995,836,2473,view,2025-01-28 06:38:39.232970
19996,134,3630,view,2025-02-14 21:20:12.272506
19997,931,4561,view,2025-02-09 11:42:40.116670
19998,865,3697,view,2025-01-01 13:17:47.508510


In [25]:
products

Unnamed: 0,product_id,product_name,category,price,rating
0,0,Centralized discrete collaboration,books,189.483446,4.546454
1,1,Adaptive coherent process improvement,books,476.467030,4.453028
2,2,Realigned 5thgeneration structure,home goods,410.641902,4.610612
3,3,Mandatory bi-directional instruction set,fashion,20.497259,4.143088
4,4,Versatile regional workforce,books,367.512114,4.337192
...,...,...,...,...,...
195,195,Multi-channeled zero-defect solution,home goods,468.258925,5.445938
196,196,Adaptive multimedia adapter,sports equipment,49.866497,4.082463
197,197,Ameliorated context-sensitive migration,fashion,106.012140,4.955438
198,198,Total 24hour productivity,fashion,95.658405,4.530035


User Clustering

In [37]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return ' '.join(tokens)

Create user-topic matrix

In [27]:
user_topics = interactions.merge(tweets, on='tweet_id')
user_topics = user_topics.groupby(['user_id', 'topic']).size().unstack(fill_value=0)

In [28]:
user_topics

topic,entertainment,politics,science,sports,technology
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,4,5,3,4,3
1,5,2,3,6,3
2,4,5,6,4,5
3,2,6,4,6,4
4,6,2,4,4,7
...,...,...,...,...,...
995,0,4,0,3,4
996,1,5,4,6,4
997,4,3,1,4,8
998,1,6,5,1,5


Cluster users

In [38]:
kmeans = KMeans(n_clusters=5, random_state=42)
clusters = kmeans.fit_predict(user_topics)
user_topics['cluster'] = clusters

TypeError: Feature names are only supported if all input features have string names, but your input has ['str', 'str_'] as feature name / column name types. If you want feature names to be stored and validated, you must convert them all to strings, by using X.columns = X.columns.astype(str) for example. Otherwise you can remove feature / column names from your input data, or convert them all to a non-string data type.

Candidate Sourcing

In [30]:
def get_candidate_tweets(target_user_id, tweets, user_topics, num_candidates=500):
    # Get user cluster
    cluster = user_topics.loc[target_user_id]['cluster']
    
    # Get tweets from same cluster users
    cluster_users = user_topics[user_topics['cluster'] == cluster].index
    cluster_tweets = tweets[tweets['author_id'].isin(cluster_users)]
    
    # Get popular tweets from all clusters
    popular_tweets = tweets.sort_values('likes', ascending=False).head(200)
    
    # Combine candidates
    candidates = pd.concat([cluster_tweets, popular_tweets]).drop_duplicates()
    return candidates.sample(min(num_candidates, len(candidates)))


Ranking Model

Prepare training data

In [39]:
interactions['target'] = interactions['interaction_type'].apply(
    lambda x: 1 if x in ['like', 'retweet', 'reply'] else 0)

merged_data = interactions.merge(users, on='user_id')
merged_data = merged_data.merge(tweets, on='tweet_id')

In [40]:
users.rename(columns={'user_id': 'author_id'}, inplace=True)

Feature engineering

In [41]:
features = merged_data[[
    'verified', 'likes', 'retweets',
    'topic' 
]]
target = merged_data['target']


Preprocessing pipeline

In [42]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['verified', 'likes', 'retweets']),
        ('cat', OneHotEncoder(), ['topic'])
])

Build model pipeline

In [43]:
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, class_weight='balanced'))
])

Train model

In [44]:
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42)
model.fit(X_train, y_train)

Post-Rules Application

In [45]:
def apply_post_rules(ranked_tweets, max_authors=3, banned_keywords=['spam', 'scam']):
    # Filter banned keywords
    clean_tweets = ranked_tweets[
        ~ranked_tweets['content'].str.contains('|'.join(banned_keywords), case=False)]
    
    # Limit authors
    clean_tweets = clean_tweets.sort_values(by='author_id', 
        key=lambda x: x.map(x.value_counts()))
    clean_tweets = clean_tweets.drop_duplicates(subset='author_id', keep='first')
    
    # Ensure topic diversity
    final_tweets = clean_tweets.sort_values(
        by=['predicted_score', 'topic'], 
        ascending=[False, True]
    ).head(50)
    
    return final_tweets


In [46]:
target_user = np.random.choice(users['author_id'])
candidates = get_candidate_tweets(target_user, tweets, user_topics)
candidates.merge(users, on='author_id')

Unnamed: 0,tweet_id,author_id,content,timestamp,topic,likes,retweets,mentioned_product,join_date,location,bio,verified
0,4184,88,Drug hear office special blood official after ...,2025-01-21 05:58:52.770650,science,62,10,-1,2022-09-17,East Cassandraburgh,With left likely first paper.,0
1,4407,677,Environmental about kid wife. Each whatever on...,2025-02-21 15:09:39.840482,technology,48,14,146,2023-02-17,North Emily,Threat put well question throughout marriage p...,0
2,3094,857,Stop of economy out pressure minute task.\nThe...,2025-04-14 10:03:20.134209,sports,59,6,-1,2024-09-07,Johnsonbury,Budget bill thousand oil speech read.,0
3,4907,580,Win and skill history. Everyone new modern rat...,2025-02-13 23:57:17.760742,science,41,10,44,2024-11-24,Tiffanyport,Drive international college each hot someone p...,0
4,3586,828,Century I reduce enough writer future major. R...,2025-01-17 21:44:25.938478,technology,55,8,-1,2023-01-03,South Juanfort,Imagine from it I wish.,0
...,...,...,...,...,...,...,...,...,...,...,...,...
495,1717,635,Class adult lead both. Mouth woman decade toni...,2025-03-08 10:28:06.931328,science,51,9,-1,2024-12-19,West Mariashire,Appear economy Congress still.,0
496,285,780,Land institution executive religious. Third te...,2025-03-05 14:15:29.999519,sports,43,8,154,2022-12-11,New Victoriaborough,Particularly arrive feeling school.,0
497,480,824,Relationship high current. Only position press...,2025-03-12 22:49:38.643512,sports,46,12,8,2025-03-25,North Michelleville,Help itself back heart.,0
498,2820,28,Next not college painting follow next availabl...,2025-03-18 15:41:14.666482,sports,54,12,-1,2021-04-16,Nelsonview,Part owner full turn.,0


In [47]:
def recommend_products(target_user_id, user_topics, tweets, products, top_n=5):
    # Get user cluster
    cluster = user_topics.loc[target_user_id]['cluster']
    
    # Get popular products in cluster
    cluster_tweets = tweets[tweets['author_id'].isin(
        user_topics[user_topics['cluster'] == cluster].index
    )]
    product_mentions = cluster_tweets[cluster_tweets['mentioned_product'] != -1]
    
    # Get top products by engagement
    product_engagement = product_mentions.merge(
        interactions, on='tweet_id'
    ).groupby('mentioned_product').size().reset_index(name='counts')
    
    # Merge with product details and sort
    recommendations = product_engagement.merge(
        products, left_on='mentioned_product', right_on='product_id'
    ).sort_values(['counts', 'rating'], ascending=False)
    
    return recommendations.head(top_n)

Full Pipeline

In [50]:
def recommend_tweets(target_user_id, users, tweets, interactions, model):
    # Step 1: Candidate sourcing
    candidates = get_candidate_tweets(target_user_id, tweets, user_topics)
    candidates = candidates.merge(users, on='author_id')
    
    # Step 2: Feature preparation
    candidate_features = candidates[[
        'verified', 'likes', 'retweets', 'topic'
    ]]
    
    # Step 3: Ranking predictions
    predictions = model.predict_proba(candidate_features)[:, 1]
    candidates['predicted_score'] = predictions
    
    # Step 4: Apply post-rules
    ranked_tweets = candidates.sort_values('predicted_score', ascending=False)
    final_recommendations = apply_post_rules(ranked_tweets)
    
    return final_recommendations

In [51]:
target_user = np.random.choice(users['author_id'])
recommendations = recommend_tweets(target_user, users, tweets, interactions, model)
product_recommendations = recommend_products(target_user, user_topics, tweets, products)
print(f"Tweet Recommendations for user {target_user}:")
print(recommendations[['tweet_id', 'content', 'predicted_score']].head(10))

print(f"Product Recommendations for user {target_user}:")
print(product_recommendations[['product_id', 'product_name', 'category', 'rating']].head(10))

Tweet Recommendations for user 502:
     tweet_id                                            content  \
384       579  Hair grow letter sure eat two early factor. Re...   
161      2019  Edge whom design.\nWhy wide later professional...   
312       499  Use character under development time indeed wa...   
7        1056  For perform minute rate no rather page say. So...   
383      4697  Local weight training customer must now. Contr...   
273      1561  West business might young. Generation hundred ...   
340      1788  Together good society remain should miss style...   
465      1174  Hit beat old budget blue. Director ok never do...   
106      1453  Six whom task admit everybody entire able. Sub...   
21        968  Agent professional outside fill world some. At...   

     predicted_score  
384         0.976758  
161         0.963656  
312         0.920292  
7           0.871118  
383         0.854440  
273         0.822369  
340         0.819907  
465         0.796780  
106     