Text classification using spaCy Word Vectors

In [6]:
import pandas as pd

# Load the two source datasets: one CSV per class.
true_df = pd.read_csv('True.csv')
fake_df = pd.read_csv('Fake.csv')

# Keep only the 'text' column and attach the class label of each source.
true_df_labeled = true_df[['text']].copy()
true_df_labeled['label'] = 'real'

fake_df_labeled = fake_df[['text']].copy()
fake_df_labeled['label'] = 'fake'

# Stack both sets into a single frame with a fresh 0..n-1 index.
merged_df = pd.concat([true_df_labeled, fake_df_labeled], ignore_index=True)

# Shuffle the rows so the two classes are interleaved. The seed is fixed
# (same value the train/test split uses) so the row order, the saved CSV and
# every downstream result are identical after Restart & Run All.
merged_df = merged_df.sample(frac=1, random_state=2011).reset_index(drop=True)

# Persist the merged dataset, then expose it under the short name used by
# the rest of the notebook (`df` is the same object as `merged_df`).
merged_df.to_csv('merged_news.csv', index=False)
df = merged_df
df.head()

Unnamed: 0,text,label
0,Tune in to the Alternate Current Radio Network...,fake
1,"First time voters, Democrats, minorities, wome...",fake
2,America is experiencing an overwhelming climat...,fake
3,"Tropical Liqueurs, otherwise known as Trops ,...",fake
4,Pro-Trump supporters have known the truth for ...,fake


In [7]:
# (rows, columns) of the merged frame.
df.shape

(44898, 2)

In [8]:
# Class balance: how many rows carry each label value.
df['label'].value_counts()

label
fake    23481
real    21417
Name: count, dtype: int64

In [9]:
# Integer encoding of the string labels (fake -> 0, real -> 1); this column
# is what the train/test split later uses as the target.
df['label_num'] = df.label.map(dict(fake=0, real=1))
df.head()
                       

Unnamed: 0,text,label,label_num
0,Tune in to the Alternate Current Radio Network...,fake,0
1,"First time voters, Democrats, minorities, wome...",fake,0
2,America is experiencing an overwhelming climat...,fake,0
3,"Tropical Liqueurs, otherwise known as Trops ,...",fake,0
4,Pro-Trump supporters have known the truth for ...,fake,0


In [10]:
import spacy
# Load the "en_core_web_lg" pipeline once; the vectorisation cell calls `nlp`
# on each article text and reads the resulting `.vector`.
nlp = spacy.load("en_core_web_lg")

In [None]:
  # One embedding per article, taken from spaCy's `Doc.vector`.
  # `nlp.pipe` streams the texts through spaCy in batches instead of making
  # one `nlp(text)` call per row. Every pipeline component is disabled because
  # only the tokenised Doc is needed: with this model `Doc.vector` is the
  # average of the tokens' static vectors, so the values match the full run.
  df['vector'] = [
      doc.vector
      for doc in nlp.pipe(df['text'], disable=nlp.pipe_names, batch_size=64)
  ]

In [None]:
# Preview the first rows; after the preceding cell `df` also has a 'vector' column.
df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. The four returned pieces are bound
# to the names the following cells rely on (X_train, X_test, y_train, y_test);
# random_state pins which rows land in each split.
X_train, X_test, y_train, y_test = train_test_split(
    df.vector,
    df.label_num,
    test_size=0.2,
    random_state=2011,
)

In [None]:
# Inspect the shape of X_train before it is stacked into an array in the next cell.
X_train.shape

In [None]:
import numpy as np

# np.stack joins the per-row vectors of each split along a new leading axis,
# giving one array per split; the estimators below are fitted on these arrays.
X_train_2d, X_test_2d = (np.stack(split) for split in (X_train, X_test))

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training split only, then apply that same fitted
# scaler to both splits so no test-set statistics leak into training.
# NOTE(review): presumably the rescaling is here because MultinomialNB expects
# non-negative features while embedding components can be negative -- confirm.
scaler = MinMaxScaler().fit(X_train_2d)
scaled_train_embed = scaler.transform(X_train_2d)
scaled_test_embed = scaler.transform(X_test_2d)

# Train the Naive Bayes classifier on the rescaled training embeddings and
# end the cell on the fitted estimator so it is displayed.
clf = MultinomialNB().fit(scaled_train_embed, y_train)
clf

In [None]:
# classification_report is imported here because no other cell in the
# notebook brings it into scope.
from sklearn.metrics import classification_report

# Predict on the rescaled held-out embeddings and print the per-class report.
y_pred = clf.predict(scaled_test_embed)

print(classification_report(y_test, y_pred))

In [None]:
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier

# 5-nearest-neighbour classifier with Euclidean distance, fitted on the
# stacked (unscaled) vectors. Note that `clf` and `y_pred` are rebound here,
# replacing the values left by the previous model's cells.
clf = KNeighborsClassifier(n_neighbors=5, metric='euclidean')
clf.fit(X_train_2d, y_train)

y_pred = clf.predict(X_test_2d)

# Per-class report on the held-out split.
print(classification_report(y_test, y_pred))

Word Vectors using Gensim

In [None]:
import gensim.downloader as api

# Load the pretrained "word2vec-google-news-300" vectors through gensim's
# downloader API; the following cells query them through `wv`.
wv = api.load("word2vec-google-news-300")

In [None]:
# Similarity score between the vectors for "great" and "well".
wv.similarity(w1="great", w2="well")

In [None]:
# Words whose vectors lie closest to "dog" in `wv`.
wv.most_similar("dog")

In [None]:
# Analogy to test: King - man + woman ≈ Queen

In [None]:
# Vector arithmetic query: add "King" and "woman", subtract "man", and list
# the words closest to the result.
wv.most_similar(
    positive=["King", "woman"],
    negative=["man"],
)

In [None]:
# Ask the model which word in the list fits least with the others.
wv.doesnt_match(["facebook","cat","google", "microsoft"])

In [None]:
# Load a second pretrained model, "glove-twitter-25", through the same gensim downloader (`api`).
glv = api.load("glove-twitter-25")

In [None]:
# Words whose vectors lie closest to "good" in `glv`.
glv.most_similar("good")