# Task 4: splitting the dataset and feature representations

Splitting the dataset: the function below splits the data into training (80%), test (10%) and validation (10%) sets.

In [1]:
from sklearn.model_selection import train_test_split
import numpy as np
def split_data(X, y, train_size=0.8, random_state=0):
    """Split features and labels into training, test and validation sets.

    A fraction ``train_size`` of the samples goes to the training set; the
    remaining samples are divided evenly between the validation and test
    sets (80/10/10 with the defaults). Both splits use the same
    ``random_state``, so the result is reproducible.

    Parameters
    ----------
    X : array-like
        Feature data, one sample per row.
    y : array-like
        Labels, with the same number of rows as ``X``.
    train_size : float, default 0.8
        Fraction of the samples assigned to the training set.
    random_state : int, default 0
        Seed passed to both ``train_test_split`` calls.

    Returns
    -------
    tuple
        ``(X_train, X_test, X_val, Y_train, Y_test, Y_val)``. Note that the
        test set comes before the validation set.
    """
    # First split: training set versus everything that is held out.
    X_train, X_holdout, Y_train, Y_holdout = train_test_split(
        X, y, train_size=train_size, random_state=random_state)
    # Second split: cut the held-out part in half; the first half is used for
    # validation, the second half for testing.
    X_val, X_test, Y_val, Y_test = train_test_split(
        X_holdout, Y_holdout, test_size=0.5, random_state=random_state)
    return X_train, X_test, X_val, Y_train, Y_test, Y_val

# Feature representation: final preprocessing steps

Here, the preprocessed dataset is split, and the feature representations for the models are created and stored in .npz files.

### Feature representation for simple models

The following code cell creates the feature representations for part 2, tasks 1-2.

The content column of 995,000_rows is encoded with both CountVectorizer (bag of words) and one-hot encoding.

The title column is encoded with CountVectorizer (bag of words).

The domain column is encoded with one-hot encoding.

In [2]:
# for tasks 2.1-2.2
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
import os
import pandas as pd

# Title, domain, content and type columns of 995,000_rows.
# The CSV is parsed once for all four columns instead of once per column. With
# integer usecols pandas keeps the file's column order, so the loaded frame holds
# domain (col 2), type (col 3), content (col 5) and title (col 9) at positions 0-3.
rows_995 = pd.read_csv("../995,000_rows_preprocessed.csv", usecols=[2, 3, 5, 9])
# Missing titles/domains become empty strings; every array keeps shape (n_rows, 1).
domain_995 = rows_995.iloc[:, [0]].fillna('').values
Y_995 = rows_995.iloc[:, [1]].values
content_995 = rows_995.iloc[:, [2]].values
title_995 = rows_995.iloc[:, [3]].fillna('').values
del rows_995
# Transform types into binary labels: 1 - real ('reliable', 'political'), 0 - fake (any other type)
Y_995 = np.isin(Y_995, ['reliable', 'political']).astype(int)

# Split features. split_data is called with the same labels and its fixed seed for
# every column, so the three feature sets are partitioned into the same rows.
X_train_content, X_test_content, X_val_content, Y_train, Y_test, Y_val = split_data(content_995, Y_995)
X_train_domain, X_test_domain, X_val_domain, _, _, _ = split_data(domain_995, Y_995)
X_train_title, X_test_title, X_val_title, _, _, _ = split_data(title_995, Y_995)
del title_995, domain_995, content_995

# One-hot for content: categories seen fewer than 100 times in the training set are
# grouped into a single "infrequent" category; unknown categories at transform time
# are mapped to that same category when it exists.
onehot_encoder1 = OneHotEncoder(min_frequency=100, handle_unknown='infrequent_if_exist')
# One-hot for domain: default settings, except that unknown domains are encoded as all zeros.
onehot_encoder = OneHotEncoder(handle_unknown='ignore')
# Count vectorizer (bag of words) for content: terms that occur in fewer than 100
# training documents are dropped, as are the placeholder tokens in stop_words.
# lowercase=False keeps the text exactly as the preprocessing step produced it.
count_vectorizer = CountVectorizer(min_df=100, lowercase=False, stop_words=['NUM', 'URL', 'EMAIL', 'DATE'])
# Count vectorizer for title: default settings.
count_vectorizer1 = CountVectorizer()

# Fit on the training split only, then transform all three splits.
# The vectorizers expect a 1-D sequence of documents, hence reshape(-1).
X_train_content_BOW = count_vectorizer.fit_transform(X_train_content.reshape(-1))
X_test_content_BOW = count_vectorizer.transform(X_test_content.reshape(-1))
X_val_content_BOW = count_vectorizer.transform(X_val_content.reshape(-1))
del count_vectorizer

# The one-hot encoders expect a 2-D array with one column per feature, hence reshape(-1, 1).
X_train_content_ONEHOT = onehot_encoder1.fit_transform(X_train_content.reshape(-1, 1))
X_test_content_ONEHOT = onehot_encoder1.transform(X_test_content.reshape(-1, 1))
X_val_content_ONEHOT = onehot_encoder1.transform(X_val_content.reshape(-1, 1))
del onehot_encoder1, X_train_content, X_test_content, X_val_content

X_train_title = count_vectorizer1.fit_transform(X_train_title.reshape(-1))
X_test_title = count_vectorizer1.transform(X_test_title.reshape(-1))
X_val_title = count_vectorizer1.transform(X_val_title.reshape(-1))
del count_vectorizer1

X_train_domain = onehot_encoder.fit_transform(X_train_domain.reshape(-1, 1))
X_test_domain = onehot_encoder.transform(X_test_domain.reshape(-1, 1))
X_val_domain = onehot_encoder.transform(X_val_domain.reshape(-1, 1))
del onehot_encoder

# Scale the content and title count features with StandardScaler.
# with_mean=False scales by the standard deviation only; centering is not
# possible on sparse matrices. The scaler is fitted on the training split only.
scaler = StandardScaler(with_mean=False)
X_train_content_BOW = scaler.fit_transform(X_train_content_BOW)
X_test_content_BOW = scaler.transform(X_test_content_BOW)
X_val_content_BOW = scaler.transform(X_val_content_BOW)
del scaler

scaler1 = StandardScaler(with_mean=False)
X_train_title = scaler1.fit_transform(X_train_title)
X_test_title = scaler1.transform(X_test_title)
X_val_title = scaler1.transform(X_val_title)
del scaler1

# Save to file.
# NOTE: the feature matrices are SciPy sparse matrices; np.savez stores them as
# pickled object arrays, so they must be loaded with np.load(..., allow_pickle=True).
np.savez("../Simple_995.npz",
         Y_train=Y_train,
         Y_test=Y_test,
         Y_val=Y_val,
         X_train_content_BOW=X_train_content_BOW,
         X_test_content_BOW=X_test_content_BOW,
         X_val_content_BOW=X_val_content_BOW,
         X_train_content_ONEHOT=X_train_content_ONEHOT,
         X_test_content_ONEHOT=X_test_content_ONEHOT,
         X_val_content_ONEHOT=X_val_content_ONEHOT,
         X_train_title=X_train_title,
         X_test_title=X_test_title,
         X_val_title=X_val_title,
         X_train_domain=X_train_domain,
         X_test_domain=X_test_domain,
         X_val_domain=X_val_domain)

# Delete variables that are no longer needed to free kernel memory
del Y_train, Y_test, Y_val, X_train_content_BOW, X_test_content_BOW, X_val_content_BOW, X_train_content_ONEHOT, X_test_content_ONEHOT, X_val_content_ONEHOT, X_train_title, X_test_title, X_val_title, X_train_domain, X_test_domain, X_val_domain


### Feature representation for simple model with 995,000 and BBC articles

The following code cell creates the feature representations for part 2, task 3.

CountVectorizer (bag of words) encoding of the 995,000_rows content combined with the scraped BBC articles.

In [2]:
#for task 2.3
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
import os
import pandas as pd

# Read the type and content columns of 995,000_rows and the BBC articles, parsing
# each CSV once. With integer usecols pandas keeps the file's column order, so in
# both frames position 0 is the type column and position 1 is the content column
# (995,000_rows: cols 3 and 5; BBC articles: cols 1 and 2).
rows_995 = pd.read_csv("../995,000_rows_preprocessed.csv", usecols=[3, 5])
rows_BBC = pd.read_csv("../BBC_articles_preprocessed.csv", usecols=[1, 2])
# Stack the BBC rows below the 995,000_rows rows; both arrays have shape (n_rows, 1).
content_995_BBC = np.vstack((rows_995.iloc[:, [1]].values, rows_BBC.iloc[:, [1]].values))
Y_995_BBC = np.vstack((rows_995.iloc[:, [0]].values, rows_BBC.iloc[:, [0]].values))
del rows_995, rows_BBC
# Transform types into binary labels: 1 - real ('reliable', 'political'), 0 - fake (any other type)
Y_995_BBC = np.isin(Y_995_BBC, ['reliable', 'political']).astype(int)

# Split the dataset
X_train, X_test, X_val, Y_train, Y_test, Y_val = split_data(content_995_BBC, Y_995_BBC)
del content_995_BBC, Y_995_BBC

# Bag of words: terms that occur in fewer than 100 training documents are dropped,
# as are the placeholder tokens in stop_words.
count_vectorizer = CountVectorizer(min_df=100, lowercase=False, stop_words=['NUM', 'URL', 'EMAIL', 'DATE'])
# Fit on the training split only, then transform all three splits.
X_train = count_vectorizer.fit_transform(X_train.reshape(-1))
X_test = count_vectorizer.transform(X_test.reshape(-1))
X_val = count_vectorizer.transform(X_val.reshape(-1))

# Scale content features. with_mean=False scales by the standard deviation only;
# centering is not possible on sparse matrices.
# The scaler is fitted on the training split ONLY and then applied unchanged to
# the test and validation splits. Re-fitting it on those splits would scale each
# one by its own statistics and leak held-out information into preprocessing.
scaler = StandardScaler(with_mean=False)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_val = scaler.transform(X_val)

# Write to file.
# NOTE: the feature matrices are SciPy sparse matrices; np.savez stores them as
# pickled object arrays, so they must be loaded with np.load(..., allow_pickle=True).
np.savez("../Simple_995_BBC.npz", X_train=X_train, X_test=X_test, X_val=X_val, Y_train=Y_train, Y_test=Y_test, Y_val=Y_val)
del count_vectorizer, scaler, X_train, X_test, X_val, Y_train, Y_test, Y_val

### Feature representations for complex model.

The following code cell creates the feature representations for part 3.

TF-IDF encoding of the 995,000_rows content combined with the BBC articles; the fitted vectorizer is then also applied to the LIAR dataset.

In [6]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
import os
import pandas as pd
import zipfile

# 995,000_rows and BBC articles: read the type and content columns, parsing each
# CSV once. With integer usecols pandas keeps the file's column order, so in both
# frames position 0 is the type column and position 1 is the content column
# (995,000_rows: cols 3 and 5; BBC articles: cols 1 and 2).
rows_995 = pd.read_csv("../995,000_rows_preprocessed.csv", usecols=[3, 5])
rows_BBC = pd.read_csv("../BBC_articles_preprocessed.csv", usecols=[1, 2])
# Stack the BBC rows below the 995,000_rows rows; both arrays have shape (n_rows, 1).
content = np.vstack((rows_995.iloc[:, [1]].values, rows_BBC.iloc[:, [1]].values))
Y = np.vstack((rows_995.iloc[:, [0]].values, rows_BBC.iloc[:, [0]].values))
del rows_995, rows_BBC
# Transform types into binary labels: 1 - real ('reliable', 'political'), 0 - fake (any other type)
Y = np.isin(Y, ['reliable', 'political']).astype(int)

# Split the dataset
X_train, X_test, X_val, Y_train, Y_test, Y_val = split_data(content, Y)
del content, Y

# TF-IDF: terms that occur in fewer than 100 training documents are dropped,
# as are the placeholder tokens in stop_words.
tfidf_vectorizer = TfidfVectorizer(min_df=100, lowercase=False, stop_words=['NUM', 'URL', 'EMAIL', 'DATE'])
# Fit on the training split only, then transform all three splits.
X_train = tfidf_vectorizer.fit_transform(X_train.reshape(-1))
X_test = tfidf_vectorizer.transform(X_test.reshape(-1))
X_val = tfidf_vectorizer.transform(X_val.reshape(-1))
# NOTE: the feature matrices are SciPy sparse matrices; np.savez stores them as
# pickled object arrays, so they must be loaded with np.load(..., allow_pickle=True).
np.savez("../advanced_features.npz", X_train=X_train, X_test=X_test, X_val=X_val, Y_train=Y_train, Y_test=Y_test, Y_val=Y_val)
del X_train, X_test, X_val, Y_train, Y_test, Y_val

# LIAR dataset.
# The LIAR .tsv files have no header row, so header=None is required; without
# it pandas would use the first sample of each file as column names and drop it.
with zipfile.ZipFile("../liar_dataset.zip", 'r') as zip_file:
    with zip_file.open("test.tsv") as test_tsv:
        test = pd.read_csv(test_tsv, sep='\t', header=None)
    with zip_file.open("valid.tsv") as valid_tsv:
        valid = pd.read_csv(valid_tsv, sep='\t', header=None)

# Transform LIAR with the vectorizer fitted above, so the features share its
# vocabulary. Column 2 is used as the text and column 1 as the label.
X_test = tfidf_vectorizer.transform(test.iloc[:, 2].values.reshape(-1))
X_val = tfidf_vectorizer.transform(valid.iloc[:, 2].values.reshape(-1))
# Binary labels: 1 for 'true' / 'mostly-true', 0 for every other label.
Y_test = test.iloc[:, 1].apply(lambda x: 1 if x in ['true', 'mostly-true'] else 0).values
Y_val = valid.iloc[:, 1].apply(lambda x: 1 if x in ['true', 'mostly-true'] else 0).values
np.savez("../LIAR_features.npz", X_test=X_test, X_val=X_val, Y_test=Y_test, Y_val=Y_val)
del test, valid, X_test, X_val, Y_test, Y_val, tfidf_vectorizer