In [6]:
import pandas as pd
import numpy as np

# --- Load behaviors.tsv ---
behaviors = pd.read_csv('/content/behaviors.tsv', sep='\t',
                        names=['ImpressionID', 'UserID', 'Time', 'History', 'Impression'])

print("\n🧠 behaviors.tsv - Sample")
print(behaviors.head())
print("\nColumns:", behaviors.columns.tolist())

# --- Load news.tsv (with error handling) ---
news = pd.read_csv('/content/news.tsv', sep='\t',
                   names=['NewsID', 'Category', 'SubCategory', 'Title', 'Abstract', 'URL', 'TitleEntities', 'AbstractEntities'],
                   on_bad_lines='skip',
                   encoding='utf-8')

print("\n📰 news.tsv - Sample")
print(news.head())
print("\nColumns:", news.columns.tolist())

# --- Load entity_embedding.vec ---
def load_embedding(path):
    embeddings = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) < 2:
                continue
            key = parts[0]
            vector = np.array(list(map(float, parts[1:])))
            embeddings[key] = vector
    return embeddings

entity_embedding = load_embedding('/content/entity_embedding.vec')
relation_embedding = load_embedding('/content/relation_embedding.vec')

print("\n🔗 entity_embedding.vec - Sample")
for k in list(entity_embedding.keys())[:3]:
    print(f"{k}: {entity_embedding[k][:5]}... (dim = {len(entity_embedding[k])})")

print("\n🔗 relation_embedding.vec - Sample")
for k in list(relation_embedding.keys())[:3]:
    print(f"{k}: {relation_embedding[k][:5]}... (dim = {len(relation_embedding[k])})")



🧠 behaviors.tsv - Sample
   ImpressionID  UserID                   Time  \
0             1  U13740  11/11/2019 9:05:58 AM   
1             2  U91836  11/12/2019 6:11:30 PM   
2             3  U73700  11/14/2019 7:01:48 AM   
3             4  U34670  11/11/2019 5:28:05 AM   
4             5   U8125  11/12/2019 4:11:21 PM   

                                             History  \
0  N55189 N42782 N34694 N45794 N18445 N63302 N104...   
1  N31739 N6072 N63045 N23979 N35656 N43353 N8129...   
2  N10732 N25792 N7563 N21087 N41087 N5445 N60384...   
3  N45729 N2203 N871 N53880 N41375 N43142 N33013 ...   
4                        N10078 N56514 N14904 N33740   

                                          Impression  
0                                  N55689-1 N35729-0  
1  N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...  
2  N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...  
3                N35729-0 N33632-0 N49685-1 N27581-0  
4  N39985-0 N36050-0 N16096-0 N8400-1 N22407-0 N6...  

Colum

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# 1. Load news.tsv
news = pd.read_csv('/content/news.tsv', sep='\t',
                   names=['NewsID', 'Category', 'SubCategory', 'Title', 'Abstract', 'URL', 'TitleEntities', 'AbstractEntities'],
                   on_bad_lines='skip')

# 2. Keep only Title and Category
news = news[['Title', 'Category']].dropna()

# 3. Encode features (Title)
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(news['Title'])

# 4. Encode labels (Category)
y = news['Category']

# 5. Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 6. Train classifier
model = LogisticRegression(max_iter=200)
model.fit(X_train, y_train)

# 7. Predict and evaluate
y_pred = model.predict(X_test)
print("\n✅ Classification Report:")
print(classification_report(y_test, y_pred))



✅ Classification Report:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


               precision    recall  f1-score   support

        autos       0.81      0.52      0.63       560
entertainment       0.83      0.40      0.54       195
      finance       0.65      0.55      0.60      1003
 foodanddrink       0.78      0.74      0.76       841
       health       0.75      0.57      0.65       596
         kids       0.00      0.00      0.00         4
    lifestyle       0.63      0.47      0.54       783
       movies       0.83      0.36      0.50       212
        music       0.84      0.44      0.57       282
         news       0.67      0.87      0.76      5459
       sports       0.86      0.94      0.90      5048
       travel       0.62      0.40      0.49       806
           tv       0.76      0.39      0.51       288
        video       0.53      0.13      0.21       709
      weather       0.74      0.68      0.70       690

     accuracy                           0.74     17476
    macro avg       0.69      0.50      0.56     17476
 weighte

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
