In [None]:
import modin.pandas as pd
import ray
from modin.config import ProgressBar

# Turn on Modin's progress bars.
ProgressBar.enable()

# Start the Ray runtime for single-node use. ignore_reinit_error=True makes the
# cell safe to re-run: without it a second ray.init() in the same kernel raises
# because Ray is already initialised.
ray.init(ignore_reinit_error=True)

In [None]:
# Read only the first 10 rows of each CSV file.
df_fake, df_true = (
    pd.read_csv(csv_path, nrows=10) for csv_path in ("fake.csv", "true.csv")
)

In [None]:
# Display the rows loaded from fake.csv.
df_fake

In [None]:
# Display the rows loaded from true.csv.
df_true

In [None]:
# Remove, in place, the columns found at positions 2 and 3.
df_fake.drop(labels=df_fake.columns[2:4].tolist(), axis=1, inplace=True)

In [None]:
# Remove, in place, the columns found at positions 2 and 3.
df_true.drop(labels=df_true.columns[2:4].tolist(), axis=1, inplace=True)

In [None]:
# Label every row: 0 for the frame read from fake.csv, 1 for the one from true.csv.
df_fake["target"], df_true["target"] = 0, 1

In [None]:
# Stack both labelled frames row-wise and renumber the index from 0.
df = pd.concat(objs=[df_fake, df_true], axis=0, ignore_index=True)
df

In [None]:
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import SnowballStemmer
from autocorrect import Speller
from string import punctuation
import string

# English stop words, held in a set for membership tests.
stop_words = {word for word in stopwords.words("english")}
# Spell correction is currently switched off:
#spell = Speller(lang="en")
stemmer = SnowballStemmer(language="english")
# Also treat the typographic apostrophe as punctuation.
punctuation = punctuation + "’"


def preprocess_corpus(text):
    """Tokenise ``text`` and return a list of normalised, stemmed tokens.

    Each token produced by ``word_tokenize`` is lower-cased, then discarded if
    it is made up solely of punctuation characters, is a stop word, or
    consists only of digits. The surviving tokens are stemmed.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (for example a missing cell read as
        NaN) yields an empty list instead of crashing the tokenizer.

    Returns
    -------
    list of str
        Stemmed tokens in their original order.
    """
    if not isinstance(text, str):
        return []

    tokens = []
    for raw_token in word_tokenize(text):
        token = raw_token.lower()
        # `punctuation` is a str, so `token in punctuation` would be a substring
        # test and would let multi-character tokens such as "..." or "--"
        # through. Checking character by character removes every
        # all-punctuation token (and empty tokens, as before).
        if all(char in punctuation for char in token):
            continue
        if token in stop_words or token.isdigit():
            continue
        tokens.append(stemmer.stem(token))
    return tokens

In [None]:
# Empty frame with "title" and "text" columns; the next cells fill them with token lists.
new_df = pd.DataFrame(columns=["title", "text"])

In [None]:
# Run preprocess_corpus over every title and store the resulting token lists.
new_df["title"] = df["title"].map(preprocess_corpus)

In [None]:
# Run preprocess_corpus over every article body and store the resulting token lists.
new_df["text"] = df["text"].map(preprocess_corpus)

In [None]:
# Display the tokenised titles and texts.
new_df

In [None]:
# ---------- Feature extraction: Word2Vec sentence vectors ----------

In [None]:
from gensim.models import Word2Vec, KeyedVectors  

# Load word vectors stored in binary word2vec format. `model` is the lookup
# table that sentence_vector queries for each token.
# NOTE(review): the file name suggests 300-dimensional Google News vectors — confirm.
path = "GoogleNews-vectors-negative300.bin.gz"
model = KeyedVectors.load_word2vec_format(path, binary=True)

In [None]:
import numpy as np


def sentence_vector(sentence, vectors=None):
    """Average the word vectors of the tokens in ``sentence``.

    Parameters
    ----------
    sentence : iterable of str
        Tokens to look up. Each token is lower-cased before the lookup and
        tokens missing from the vocabulary are skipped.
    vectors : optional
        Object exposing ``get_vector(word)`` and ``vector_size``. Defaults to
        the module-level ``model``.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``vectors.vector_size``: the element-wise mean of
        the vectors found, or all zeros when no token is in the vocabulary.
    """
    lookup = model if vectors is None else vectors

    word_vectors = []
    for word in sentence:
        try:
            word_vectors.append(lookup.get_vector(word.lower()))
        except KeyError:
            # Out-of-vocabulary token: leave it out of the average.
            continue

    if not word_vectors:
        # Without this guard an empty list becomes a 1-D array and the 2-D
        # mean below fails; a zero vector keeps every row the same width.
        return np.zeros(lookup.vector_size, dtype=np.float32)

    return np.mean(np.array(word_vectors), axis=0)

def get_vector_array(ColName, vector_size=300):
    """Turn one token-list column of ``new_df`` into a 2-D array of sentence vectors.

    Parameters
    ----------
    ColName : str
        Name of the ``new_df`` column whose entries are passed to
        ``sentence_vector``.
    vector_size : int, default 300
        Width used for the result when the column has no rows.

    Returns
    -------
    numpy.ndarray
        float64 array with one row per entry of the column.

    Raises
    ------
    ValueError
        If the sentence vectors do not all have the same length (raised by
        ``np.vstack``), instead of silently misaligning rows.
    """
    working_series = new_df[ColName].map(sentence_vector)
    # Collect the rows and stack once; np.append in a loop copies the whole
    # array on every iteration.
    rows = [np.asarray(vector, dtype=np.float64) for vector in working_series]
    if not rows:
        return np.empty((0, vector_size), dtype=np.float64)
    return np.vstack(rows)

In [None]:
# Sentence vectors for the "title" column, one row per entry.
title = get_vector_array("title")

In [None]:
# Display the title vectors.
title

In [None]:
# Sanity check: expected to be (number of rows, 300).
title.shape

In [None]:
# Sentence vectors for the "text" column, one row per entry.
text = get_vector_array("text")

In [None]:
# Features: title vectors followed by text vectors, joined column-wise.
X = np.concatenate([title, text], axis=1)
# Labels as a single-column 2-D array.
y = df[["target"]].values


In [None]:
# Title and text vectors sit side by side, so the column count is their combined width.
X.shape

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows, then split that hold-out in half to obtain the
# validation and test sets. Both splits shuffle with the same fixed seed.
X_train, X_valid_test, y_train, y_valid_test = train_test_split(
    X,
    y,
    shuffle=True,
    random_state=42,
    test_size=0.33,
)

X_valid, X_test, y_valid, y_test = train_test_split(
    X_valid_test,
    y_valid_test,
    shuffle=True,
    random_state=42,
    test_size=0.5,
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to the validation and test splits.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled, X_test_scaled = map(scaler.transform, (X_valid, X_test))

In [None]:
from sklearn.decomposition import PCA

# For simplicity, skip cross-validating the component count with an estimator.
# Fit PCA with all components kept so the elbow method can be used to pick the
# number of components.
pca = PCA(n_components=None).fit(X_train_scaled)
pca

In [None]:
import matplotlib.pyplot as plt

# Cumulative explained variance, converted from a 0-1 ratio to a percentage so
# that it matches the axis label.
cumulative_variance_pct = np.cumsum(pca.explained_variance_ratio_) * 100
# Component counts start at 1; plotting against the default 0-based index would
# shift the curve one position to the left.
component_counts = np.arange(1, len(cumulative_variance_pct) + 1)

fig, ax = plt.subplots()
ax.plot(component_counts, cumulative_variance_pct)
ax.set_title('Cumulative Percentage of Variance Explained')
ax.set_xlabel('Number of Components')
ax.set_ylabel('Cumulative Percentage of Variance Explained')
plt.show()

In [None]:
# The original `n_components = ?` placeholder was not valid Python. A float in
# (0, 1) asks PCA to keep the smallest number of components whose cumulative
# explained variance reaches that fraction; replace it with an integer chosen
# from the elbow plot if preferred.
PCA_VARIANCE_TO_KEEP = 0.95
pca = PCA(n_components=PCA_VARIANCE_TO_KEEP, svd_solver="full")
pca.fit(X_train_scaled)
X_train_pca = pca.transform(X_train_scaled)
X_valid_pca = pca.transform(X_valid_scaled)
X_test_pca = pca.transform(X_test_scaled)

In [None]:
import modin.experimental.xgboost as xgb

# Modin's xgboost DMatrix expects Modin DataFrames, so wrap the NumPy arrays first.
X_train_pca_df = pd.DataFrame(X_train_pca)
X_valid_pca_df = pd.DataFrame(X_valid_pca)
X_test_pca_df = pd.DataFrame(X_test_pca)
y_train_df = pd.DataFrame(y_train)
y_valid_df = pd.DataFrame(y_valid)
y_test_df = pd.DataFrame(y_test)

# Create DMatrix
dtrain = xgb.DMatrix(X_train_pca_df, y_train_df)
dvalid = xgb.DMatrix(X_valid_pca_df, y_valid_df)
dtest = xgb.DMatrix(X_test_pca_df, y_test_df)

# Set training parameters.
# "num_class" is intentionally not set: it belongs to the multi-class
# objectives, whereas "binary:logistic" produces one probability per row.
xgb_params = {
    "eta": 0.3,
    "max_depth": 3,
    "objective": "binary:logistic",
    "eval_metric": "logloss",
}
steps = 20

# Create dict for evaluation results
evals_result = dict()

# Run training, evaluating on the validation set after each boosting round.
# NOTE: this rebinds `model`, which earlier held the word vectors used by
# sentence_vector; re-run the vector-loading cell before calling
# sentence_vector again in the same kernel.
model = xgb.train(
    xgb_params,
    dtrain,
    steps,
    evals=[(dvalid, "")],
    evals_result=evals_result
)

# Print evaluation results
print(f'Evals results:\n{evals_result}')

# Predict results
prediction = model.predict(dtest)

# Print prediction results
print(f'Prediction results:\n{prediction}')