In [42]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import string
import time
import csv
import os
import sys
from nltk.corpus import stopwords
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text

In [46]:
class FeatureEngineer:
    """Build a per-document feature table and write it to a CSV file.

    Up to three feature families can be switched on at construction time:
    TF-IDF weights of the processed text, the whitespace-token count of the
    processed text, and the number of '!' characters in the original text.
    The selected families are concatenated column-wise, in that order.
    """

    def __init__(self, tfidf=True, number_of_words=False, number_of_exclamatory=False):
        """Store the feature switches and create an unfitted TF-IDF vectorizer.

        Args:
            tfidf: include the TF-IDF columns.
            number_of_words: include the word-count column.
            number_of_exclamatory: include the '!'-count column.
        """
        self.tfidf = tfidf
        self.number_of_words = number_of_words
        self.number_of_exclamatory = number_of_exclamatory
        # Float max_df / min_df are read by scikit-learn as document-frequency
        # proportions, so very common and very rare terms are dropped.
        self.tfidf_vectorizer = TfidfVectorizer(max_df=0.85, min_df=0.15)

    @staticmethod
    def _as_text(value):
        """Return ``value`` as ``str``, mapping missing cells to ``''``.

        ``pd.read_csv`` parses empty cells as NaN by default, so a document
        that became empty during preprocessing arrives here as a float.
        """
        if isinstance(value, str):
            return value
        if value is None or value is pd.NA or (isinstance(value, float) and value != value):
            return ''
        return str(value)

    def encode(self, process_input_filename=None, original_input_filename=None, output_filename=None, istrain=True):
        """Compute the enabled features and write them to ``output_filename``.

        Args:
            process_input_filename: CSV with a header row and 'text' / 'label'
                columns (the processed documents).
            original_input_filename: header-less CSV whose column 1 holds the
                original text. Only read when ``number_of_exclamatory`` is on.
            output_filename: path of the CSV to write.
            istrain: when True the TF-IDF vectorizer is fitted on this data;
                when False the previously fitted vectorizer is reused.

        Raises:
            ValueError: if the original file does not have the same number of
                rows as the processed file.
        """
        data, labels = self.read_data(process_input_filename, col1='text', col2='label')

        # The original text is only needed for the '!' count; skipping the read
        # otherwise lets callers omit original_input_filename entirely.
        odata = None
        if self.number_of_exclamatory:
            odata, _ = self.read_data(original_input_filename, col1=1, col2=0, header=False)
            if len(odata) != len(data):
                raise ValueError(
                    'original file has %d rows but processed file has %d rows'
                    % (len(odata), len(data))
                )

        combination = []
        if self.tfidf:
            combination.append(self.tf(data, labels, istrain))
        if self.number_of_words:
            combination.append(self.numofwords(data))
        if self.number_of_exclamatory:
            combination.append(self.numofexclam(odata))

        # Concatenate the feature blocks row by row; plain lists keep each
        # value's own type (ints stay ints) when written to the CSV.
        matrix = [[] for _ in range(len(data))]
        for block in combination:
            for row, values in zip(matrix, block):
                row.extend(values)
        self.generate(matrix, labels, output_filename)

    def generate(self, matrix, labels, output_filename):
        """Write ``matrix`` and ``labels`` as CSV with an id/d1..dN/label header.

        Args:
            matrix: sequence of feature rows; the header width is taken from
                the first row. An empty matrix yields a header-only file.
            labels: sequence indexed in step with ``matrix``.
            output_filename: destination path (overwritten).
        """
        n_features = len(matrix[0]) if len(matrix) > 0 else 0
        fieldnames = ['id'] + ['d' + str(i) for i in range(1, n_features + 1)] + ['label']
        with open(output_filename, mode='w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(fieldnames)
            for i, row in enumerate(matrix):
                writer.writerow([i, *row, labels[i]])

    def read_data(self, filename, col1, col2, header=True):
        """Read two columns of a CSV file as Python lists.

        Args:
            filename: path of the CSV file.
            col1: name (or integer position when ``header`` is False) of the
                first column to return.
            col2: same, for the second column.
            header: whether the first line of the file is a header row.

        Returns:
            Tuple ``(col1_values, col2_values)`` of lists.
        """
        if header:
            raw_data = pd.read_csv(filename)
        else:
            raw_data = pd.read_csv(filename, header=None)
        return raw_data[col1].tolist(), raw_data[col2].tolist()

    def tf(self, data, labels, istrain):
        """Return the dense TF-IDF matrix for ``data``.

        Args:
            data: iterable of documents; missing cells count as empty text.
            labels: unused; kept so existing callers keep working.
            istrain: fit the vectorizer on ``data`` when True, otherwise only
                transform with the already fitted vectorizer.
        """
        documents = [self._as_text(doc) for doc in data]
        if istrain:
            tfidf_matrix = self.tfidf_vectorizer.fit_transform(documents)
        else:
            tfidf_matrix = self.tfidf_vectorizer.transform(documents)
        return tfidf_matrix.toarray()

    def numofwords(self, data):
        """Return ``[[n]]``-style rows with the whitespace-token count per document."""
        return [[len(self._as_text(doc).split())] for doc in data]

    def numofexclam(self, data):
        """Return ``[[n]]``-style rows with the number of '!' characters per document."""
        return [[self._as_text(doc).count('!')] for doc in data]
        


In [47]:
# Enable all three feature families: TF-IDF, word count and '!' count.
fe = FeatureEngineer(
    tfidf=True,
    number_of_words=True,
    number_of_exclamatory=True,
)

In [48]:
# Training pass: istrain is left at its default (True), so the TF-IDF
# vectorizer is fitted on this data.
fe.encode(
    process_input_filename='../dataset/process_train.csv',
    original_input_filename='../dataset/fulltrain.csv',
    output_filename='../dataset/feature_train.csv',
)

In [49]:
# Test pass: istrain=False, so the vectorizer is only applied (transform),
# not re-fitted; this cell therefore has to run after the training cell.
fe.encode(
    process_input_filename='../dataset/process_test.csv',
    original_input_filename='../dataset/balancedtest.csv',
    output_filename='../dataset/feature_test.csv',
    istrain=False,
)