In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package punkt to /home/angelo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
def makeData(n_samples=93240,n_features=11,dataPath="News_Final.csv"):
    """Load the CSV at ``dataPath`` and split it into features and labels.

    Only the first ``n_samples`` rows are considered.  The last three columns
    are treated as the labels, every other column as a feature.

    A row is discarded when
      * all three of its labels are 0, or
      * its second label column equals -1.

    Parameters
    ----------
    n_samples : int
        Number of leading rows of the file to keep before filtering.
    n_features : int
        Accepted for interface compatibility; it is not used by the function.
    dataPath : str
        Path of the CSV file handed to ``pandas.read_csv``.

    Returns
    -------
    (X, labels) : tuple of numpy.ndarray
        Feature rows and the matching label rows that survived the filtering.
    """
    table = pd.read_csv(dataPath).to_numpy()[:n_samples]
    features, targets = table[:, :-3], table[:, -3:]

    # A row survives only if at least one label is non-zero AND the second
    # label is not the -1 marker; both conditions are evaluated per row, so
    # they can be merged into a single mask.
    has_nonzero_label = (targets != 0).any(axis=1)
    second_label_ok = targets[:, 1] != -1
    keep = has_nonzero_label & second_label_ok

    return features[keep], targets[keep]

In [10]:
def splitData(X, labels, test_size, seed):
    """Randomly partition ``X`` and ``labels`` into a train and a test part.

    Parameters
    ----------
    X : numpy.ndarray
        Feature rows; the split is made along the first axis.
    labels : numpy.ndarray
        Labels indexed the same way as ``X``.
    test_size : float
        Fraction of rows for the test part; the train part receives
        ``int(n_rows * (1 - test_size))`` rows.
    seed : int
        Seed passed to ``np.random.seed``.  Note that this reseeds NumPy's
        global random state as a side effect.

    Returns
    -------
    X_train, X_test, y_train, y_test
        Train rows come in the sampled (random) order; test rows are whatever
        is left over, in the order produced by ``np.setdiff1d``.
    """
    np.random.seed(seed)

    row_ids = range(X.shape[0])
    n_train = int(len(row_ids) * (1-test_size))

    # Sample the train rows without replacement; the rest becomes the test set.
    train_rows = np.random.choice(row_ids, size=n_train, replace=False)
    test_rows = np.setdiff1d(row_ids, train_rows)

    return X[train_rows], X[test_rows], labels[train_rows], labels[test_rows]

In [11]:
from pandas.core.internals.blocks import final
class NewsData(object):
    """Train/test container for the news CSV with in-place feature encoding.

    The constructor loads the data with ``makeData`` and splits it with
    ``splitData``.  The encoding methods then overwrite single cells of
    ``X_train`` / ``X_test`` (object arrays) with the encoded value:

    * ``OneHotEncodeTrain`` / ``OneHotEncodeTest`` replace a categorical cell
      with its one-hot list.
    * ``getCorpus`` + ``combineCorpus`` build the token table that the TF-IDF
      vectorizer is fitted on; ``vectorizeTrain`` / ``vectorizeTest`` then
      replace a text cell with its TF-IDF row.

    Expected call order for text columns: ``getCorpus`` for every text column,
    then ``combineCorpus`` once, then the ``vectorize*`` methods.

    The encoding methods are not idempotent: running one twice on the same
    column operates on already-encoded cells.

    Parameters
    ----------
    n_samples : int
        Number of leading CSV rows handed to ``makeData``.
    test_size : float
        Test fraction handed to ``splitData``.
    seed : int
        Random seed handed to ``splitData``.
    """

    def __init__(self, n_samples=15000, test_size=0.2, seed=2023):
        self.X, self.labels = makeData(n_samples)
        self.X_train, self.X_test, self.y_train, self.y_test = splitData(
            self.X, self.labels, test_size, seed
        )
        # Most recently fitted one-hot encoder (kept for code that reads it).
        self.encoder = OneHotEncoder()
        # Column index -> encoder fitted on that column of the training split.
        self.encoders = {}
        self.vectorizer = TfidfVectorizer()
        # TF-IDF matrix of the token table; None until the vectorizer is fitted.
        self.tfidf = None
        # List of per-column token lists; becomes a DataFrame in combineCorpus.
        self.corpus = []

    def _fit_encoder(self, index):
        """Fit, store and return a one-hot encoder for train column ``index``."""
        # Categories that only occur in the test split are encoded as all
        # zeros instead of raising.
        encoder = OneHotEncoder(handle_unknown="ignore")
        encoder.fit(np.array(self.X_train[:, index]).reshape(-1, 1))
        self.encoders[index] = encoder
        self.encoder = encoder
        return encoder

    @staticmethod
    def _write_one_hot(X, index, encoder):
        """Overwrite column ``index`` of ``X`` with one-hot lists from ``encoder``."""
        col = np.array(X[:, index]).reshape(-1, 1)
        one_hot_data = encoder.transform(col).toarray()
        for i in range(X.shape[0]):
            X[i][index] = list(one_hot_data[i])

    def OneHotEncodeTrain(self, index):
        """Fit an encoder on train column ``index`` and one-hot encode it in place."""
        encoder = self._fit_encoder(index)
        self._write_one_hot(self.X_train, index, encoder)

    def OneHotEncodeTest(self, index):
        """One-hot encode test column ``index`` in place.

        The encoder fitted on the training split for this column is reused so
        that train and test share the same one-hot layout.  If the train
        column has not been encoded yet, an encoder is fitted on it first
        (the train data itself is left untouched in that case).
        """
        encoder = self.encoders.get(index)
        if encoder is None:
            encoder = self._fit_encoder(index)
        self._write_one_hot(self.X_test, index, encoder)

    def getCorpus(self, index):
        """Append the sorted unique tokens of train column ``index`` to ``self.corpus``.

        Non-string cells (e.g. missing values) are skipped, matching the way
        the ``vectorize*`` methods treat them as empty text.
        """
        texts = [value for value in self.X_train[:, index] if isinstance(value, str)]

        # Join with a space so the last word of one row and the first word of
        # the next row are not glued together.
        tokens = word_tokenize(" ".join(texts))

        self.corpus.append(np.unique(tokens).tolist())

    def combineCorpus(self):
        """Merge all collected token lists into one DataFrame with a ``words`` column.

        The result holds every distinct token once, in sorted order.  Calling
        the method again after the merge is a no-op.
        """
        if isinstance(self.corpus, pd.DataFrame):
            return
        vocabulary = sorted({str(word) for words in self.corpus for word in words})
        self.corpus = pd.DataFrame(vocabulary, columns=['words'])

    def _vectorize_column(self, X, index):
        """Replace every cell of column ``index`` in ``X`` with its TF-IDF row."""
        if self.tfidf is None:
            # NOTE(review): the vectorizer is fitted on the table of unique
            # tokens, i.e. every "document" is a single token, so the IDF
            # weights reflect that table rather than the news texts.
            self.tfidf = self.vectorizer.fit_transform(self.corpus['words'].values.astype('U'))
        for i in range(X.shape[0]):
            text = X[i][index]
            if not isinstance(text, str):
                # Missing / non-text cells are vectorized as empty text.
                text = ""
            X[i][index] = self.vectorizer.transform([text])

    def vectorizeTrain(self, index):
        """TF-IDF vectorize train column ``index`` in place (fits the vectorizer if needed)."""
        self._vectorize_column(self.X_train, index)

    def vectorizeTest(self, index):
        """TF-IDF vectorize test column ``index`` in place.

        Returns
        -------
        The dense form (``toarray()``) of the TF-IDF matrix the vectorizer was
        fitted on.
        """
        self._vectorize_column(self.X_test, index)

        return self.tfidf.toarray()

In [12]:
newsData = NewsData()

# One-hot encode columns 3 and 4: the whole train split first, then the test split.
for encode_split in (newsData.OneHotEncodeTrain, newsData.OneHotEncodeTest):
    for column in (3, 4):
        encode_split(column)

# Collect the tokens of text columns 1 and 2 and merge them into one table.
for column in (1, 2):
    newsData.getCorpus(column)
newsData.combineCorpus()

# TF-IDF vectorize the text columns of the train split.
for column in (1, 2):
    newsData.vectorizeTrain(column)

# Same for the test split.  The last call stays a bare expression so the cell
# still displays its return value.
newsData.vectorizeTest(1)
newsData.vectorizeTest(2)

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])