In [1]:
from pymongo import MongoClient
import pandas as pd

MONGO_URI = "mongodb://localhost:27017/"
DB_NAME   = "News"
COLL_NAME = "rawData"     # source collection; the projection below asks it for clean/tokens/lemmas
LABEL_COL = "state"       # column used as the classification label

client = MongoClient(MONGO_URI)
coll = client[DB_NAME][COLL_NAME]

# Inclusion projection: only the listed fields are fetched, so "_id" and the
# stray "Unnamed: 0" column never reach the DataFrame.
FIELDS = {
    "_id": 0, "title": 1, "text": 1, "clean": 1,
    "tokens": 1, "lemmas": 1, LABEL_COL: 1
}
docs = list(coll.find({}, FIELDS))
df = pd.DataFrame(docs)

# Fail early with a readable message: an empty or wrong collection yields a
# frame without the label column, which would otherwise surface as a bare
# KeyError inside dropna().
if LABEL_COL not in df.columns:
    raise ValueError(
        f"Colonna '{LABEL_COL}' assente: la collection {DB_NAME}.{COLL_NAME} "
        f"è vuota o non contiene l'etichetta ({len(docs)} documenti letti)."
    )

# Minimal filtering: drop unlabeled rows and normalise labels to strings.
df = df.dropna(subset=[LABEL_COL]).copy()
df[LABEL_COL] = df[LABEL_COL].astype(str)
print(df.shape)
df.head(2)


(44898, 6)


Unnamed: 0,title,text,state,clean,tokens,lemmas
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,1,donald trump just couldn t wish all americans ...,"[donald, trump, wish, americans, happy, new, y...","[donald, trump, wish, american, happy, new, ye..."
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,1,house intelligence committee chairman devin nu...,"[house, intelligence, committee, chairman, dev...","[house, intelligence, committee, chairman, dev..."


In [2]:
import pandas as pd

# Step 1: the rest of the notebook needs the 'lemmas' column, so stop here if it is missing.
lemmas_available = "lemmas" in df.columns
if not lemmas_available:
    raise ValueError("Colonna 'lemmas' assente nel DataFrame. Assicurati di leggere da 'rawData'.")


def _is_nonempty_list(value):
    """Return True only for list instances holding at least one element."""
    return isinstance(value, list) and len(value) > 0


# Step 2: discard rows whose lemmas are null.
df = df.dropna(subset=["lemmas"]).copy()

# Step 3: keep only rows whose lemmas are a non-empty list, then renumber the rows from 0.
valid_rows = df["lemmas"].apply(_is_nonempty_list)
df = df[valid_rows].reset_index(drop=True)

# Step 4: the lemmas are used as-is as the token sequence for the embedding step.
df["tokens_used"] = df["lemmas"]

print("Docs (con lemmi validi):", len(df))
preview = df[[LABEL_COL, "tokens_used"]].head(2)
print(preview)


Docs (con lemmi validi): 44182
  state                                        tokens_used
0     1  [donald, trump, wish, american, happy, new, ye...
1     1  [house, intelligence, committee, chairman, dev...


In [3]:
from gensim.models import Word2Vec

W2V_SIZE   = 100  # dimensionality of the word vectors
W2V_WINDOW = 5    # context window passed to Word2Vec
W2V_MIN    = 2    # min_count: words seen fewer times are left out of the vocabulary
W2V_SG     = 1    # skip-gram
W2V_SEED   = 42   # fixed seed so the embedding initialisation does not change between runs

# One token list per document; this is the corpus Word2Vec is trained on.
docs = df["tokens_used"].tolist()

# NOTE: the seed pins the initial vectors, but with workers > 1 thread
# scheduling can still cause small run-to-run differences. Set workers=1
# (and a fixed PYTHONHASHSEED) if bit-for-bit reproducibility is required.
w2v = Word2Vec(
    sentences=docs,
    vector_size=W2V_SIZE,
    window=W2V_WINDOW,
    min_count=W2V_MIN,
    workers=4,
    sg=W2V_SG,
    seed=W2V_SEED,
)

print("Vocabolario:", len(w2v.wv))


Vocabolario: 63899


In [4]:
import numpy as np

def doc_vector(words, model):
    """Average the word vectors of one document.

    Words that are not present in ``model.wv`` are skipped. When none of the
    words is known (or ``words`` is empty) a zero vector of length
    ``model.vector_size`` is returned instead; that fallback uses NumPy's
    default dtype.
    """
    keyed_vectors = model.wv
    known = [keyed_vectors[token] for token in words if token in keyed_vectors]
    if not known:
        return np.zeros(model.vector_size)
    return np.mean(known, axis=0)

# One averaged embedding per document, stacked row-wise into the feature matrix.
X = np.vstack([doc_vector(doc, w2v) for doc in docs])

# Labels come from the same frame the documents were taken from. Read the
# configured label column directly instead of a hard-coded "state" and a
# label-based .loc slice, which is only right while the index is 0..n-1.
y = df[LABEL_COL].to_numpy()
assert len(X) == len(y), f"X has {len(X)} rows but y has {len(y)} labels"

print("X shape:", X.shape)
print("y sample:", y[:10])


X shape: (44182, 100)
y sample: ['1' '1' '1' '1' '1' '1' '1' '1' '1' '1']


In [5]:
# Display the document-embedding matrix as the cell output.
X

array([[-0.14576551,  0.20846634,  0.19916436, ...,  0.07032482,
         0.04025758, -0.32057312],
       [-0.07765286,  0.15813601,  0.34665048, ...,  0.03394436,
        -0.00245274, -0.36184576],
       [ 0.00100248,  0.08843064,  0.20083056, ...,  0.12334862,
         0.02798297, -0.37150669],
       ...,
       [-0.19437018,  0.26075804,  0.26103425, ...,  0.01103982,
         0.07664921, -0.36172029],
       [-0.30282566,  0.25800672,  0.32357222, ..., -0.09570906,
         0.13183808, -0.41898799],
       [-0.33571106,  0.31191632,  0.28925517, ...,  0.01730149,
         0.11054684, -0.40369469]])

In [6]:
# Display the label array as the cell output.
y

array(['1', '1', '1', ..., '0', '0', '0'], dtype=object)

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
import pandas as pd

# Single source of truth for every random_state used in this cell
# (previously the split hard-coded 42 before SEED was even defined).
SEED = 42

# Stratified hold-out split: 20% of the documents are kept for testing and
# the class proportions of y are preserved in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED, stratify=y
)

# Classifiers compared on the same split.
models = {
    "LogReg": LogisticRegression(max_iter=2000, random_state=SEED),
    "LinearSVM": LinearSVC(random_state=SEED),
    "RandomForest": RandomForestClassifier(n_estimators=300, random_state=SEED),
    "GaussianNB": GaussianNB(),
    "DecisionTree": DecisionTreeClassifier(random_state=SEED),
    "KNN": KNeighborsClassifier(n_neighbors=5)  # default k=5
}

scores = []  # one (model, accuracy, precision, recall, f1) tuple per classifier
preds = {}   # model name -> predictions on the test split

for name, clf in models.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # Macro averaging weighs both classes equally regardless of their support.
    acc = accuracy_score(y_test, y_pred)
    f1  = f1_score(y_test, y_pred, average="macro")
    prec = precision_score(y_test, y_pred, average="macro")
    rec  = recall_score(y_test, y_pred, average="macro")

    print(f"\n== {name} ==")
    print(f"Accuracy: {acc:.4f} | Precision: {prec:.4f} | Recall: {rec:.4f} | F1-macro: {f1:.4f}")
    print(classification_report(y_test, y_pred))

    scores.append((name, acc, prec, rec, f1))
    preds[name] = y_pred

# Summary table, best macro-F1 first.
df_scores = pd.DataFrame(scores, columns=["model","accuracy","precision","recall","f1"])
print("\n=== Summary ===")
print(df_scores.sort_values("f1", ascending=False))



== LogReg ==
Accuracy: 0.9571 | Precision: 0.9570 | Recall: 0.9573 | F1-macro: 0.9571
              precision    recall  f1-score   support

           0       0.95      0.96      0.96      4283
           1       0.96      0.95      0.96      4554

    accuracy                           0.96      8837
   macro avg       0.96      0.96      0.96      8837
weighted avg       0.96      0.96      0.96      8837


== LinearSVM ==
Accuracy: 0.9578 | Precision: 0.9577 | Recall: 0.9578 | F1-macro: 0.9578
              precision    recall  f1-score   support

           0       0.95      0.96      0.96      4283
           1       0.96      0.96      0.96      4554

    accuracy                           0.96      8837
   macro avg       0.96      0.96      0.96      8837
weighted avg       0.96      0.96      0.96      8837


== RandomForest ==
Accuracy: 0.9614 | Precision: 0.9614 | Recall: 0.9614 | F1-macro: 0.9614
              precision    recall  f1-score   support

           0       0.

In [9]:
# Show the per-model metrics table as the cell output.
df_scores

Unnamed: 0,model,accuracy,precision,recall,f1
0,LogReg,0.957112,0.956984,0.957277,0.957087
1,LinearSVM,0.957791,0.957691,0.957845,0.957759
2,RandomForest,0.961412,0.961391,0.961359,0.961375
3,GaussianNB,0.900192,0.900058,0.900189,0.900116
4,DecisionTree,0.916487,0.917366,0.915839,0.916285
5,KNN,0.938101,0.938387,0.93872,0.938097


In [10]:

from pymongo import MongoClient
import numpy as np, math

client = MongoClient("mongodb://localhost:27017/")
coll = client["News"]["features2_w2v"]
# Empty the target collection first so a re-run does not append duplicate chunks.
coll.delete_many({})

X32 = X.astype(np.float32)
y_arr = np.asarray(y)
CHUNK = 2000  # rows stored per MongoDB document
n_rows = len(y_arr)

# Write the matrix and labels in consecutive row blocks; "idx" records the
# block order so the data can be reassembled by sorting on it.
for chunk_idx, start in enumerate(range(0, n_rows, CHUNK)):
    stop = min(start + CHUNK, n_rows)
    payload = {
        "idx": chunk_idx,
        "X": X32[start:stop].tolist(),
        "y": y_arr[start:stop].tolist(),
    }
    coll.insert_one(payload)

coll.create_index("idx")
print("Done.")


Done.


In [11]:
import numpy as np
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017/")
coll = client["News"]["features2_w2v"]

# Read the stored chunks back in "idx" order and concatenate their rows and labels.
X_list = []
y_list = []
ordered_chunks = coll.find({}, {"_id": 0}).sort("idx", 1)
for chunk in ordered_chunks:
    X_list += chunk["X"]
    y_list += chunk["y"]

X_rec = np.array(X_list, dtype=np.float32)
y_rec = np.array(y_list)
print(X_rec.shape, y_rec.shape)

(44182, 100) (44182,)


In [12]:
# Display the feature matrix rebuilt from the stored chunks as the cell output.
X_rec

array([[-0.14576551,  0.20846634,  0.19916436, ...,  0.07032482,
         0.04025758, -0.32057312],
       [-0.07765286,  0.15813601,  0.34665048, ...,  0.03394436,
        -0.00245274, -0.36184576],
       [ 0.00100248,  0.08843064,  0.20083056, ...,  0.12334862,
         0.02798297, -0.3715067 ],
       ...,
       [-0.19437018,  0.26075804,  0.26103425, ...,  0.01103982,
         0.07664921, -0.3617203 ],
       [-0.30282566,  0.25800672,  0.32357222, ..., -0.09570906,
         0.13183808, -0.418988  ],
       [-0.33571106,  0.31191632,  0.28925517, ...,  0.01730149,
         0.11054684, -0.4036947 ]], dtype=float32)