In [1]:
import numpy as np
import pandas as pd
import re

from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import stem_text

from sklearn.feature_extraction.text import CountVectorizer

from scipy import sparse
from scipy.sparse import csr_matrix

import os
# Walk the Kaggle input mount and echo the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

/kaggle/input/traintestpkls-codeproject/train_pickle.pkl
/kaggle/input/traintestpkls-codeproject/soln_pickle.pkl
/kaggle/input/traintestpkls-codeproject/test_pickle.pkl


In [2]:
def to_lowercase(text):
    """Lower-case every entry of a text column.

    Args:
        text: An object exposing the pandas ``.str`` accessor (the notebook
            passes the ``PostText`` column).

    Returns:
        The result of ``text.str.lower()``; the input itself is not modified.
    """
    lowered = text.str.lower()
    return lowered

# Patterns are compiled once here rather than on every call.
_ANSI_ESCAPE_RE = re.compile(r'(\x9B|\x1B\[)[0-?]*[ -\/]*[@-~]')
_NON_LETTER_RE = re.compile("[^a-zA-Z' ]+")


def remove_escape_seqs(s):
    """Strip ANSI escape sequences, then blank out every non-letter run.

    The ANSI pattern must run first: the non-letter substitution replaces
    the ESC byte and ``[`` with spaces, so applying it first would leave the
    escape pattern nothing to match and the final letter of each sequence
    (e.g. the ``m`` of ``\\x1b[31m``) would leak into the text.

    Args:
        s: A single string.

    Returns:
        ``s`` with ANSI CSI sequences removed and each run of characters
        other than ASCII letters, apostrophes and spaces replaced by one
        space. Whitespace is not otherwise collapsed or trimmed.
    """
    without_ansi = _ANSI_ESCAPE_RE.sub('', s)
    return _NON_LETTER_RE.sub(' ', without_ansi)

def remove_puncs_and_nums(s):
    """Drop punctuation and digit characters from each word of a string.

    The string is split on whitespace; within every word only characters
    that are alphanumeric and not digits are kept. The (possibly empty)
    words are then re-joined with single spaces, so a word made entirely of
    digits or punctuation leaves an empty slot between two spaces.

    Args:
        s: A single string.

    Returns:
        The filtered words joined by ``' '``.
    """
    def _is_kept(ch):
        return ch.isalnum() and not ch.isdigit()

    cleaned_tokens = []
    for token in s.split():
        cleaned_tokens.append(''.join(filter(_is_kept, token)))
    return ' '.join(cleaned_tokens)

def remove_spec_chars(text):
    """Clean every document with ``remove_escape_seqs`` then ``remove_puncs_and_nums``.

    The previous implementation looped over ``range(len(text))`` and used
    ``text[i]``. On a pandas Series that is a label lookup, so any Series
    whose index is not exactly ``0..n-1`` raised ``KeyError``. It also
    overwrote the caller's object in place. Both are avoided here by mapping
    element-wise and returning a new container.

    Args:
        text: A pandas Series of strings, or any other iterable of strings.

    Returns:
        For a Series, a new Series with the same index holding the cleaned
        strings; for any other iterable, a new list of cleaned strings.
        The input is left unmodified.
    """
    def _clean(doc):
        return remove_puncs_and_nums(remove_escape_seqs(doc))

    if isinstance(text, pd.Series):
        # Element-wise map keeps the caller's index and never touches labels.
        return text.map(_clean)
    return [_clean(doc) for doc in text]

def remove_two_letter_words(text):
    """Drop every whitespace-separated word of two characters or fewer.

    Despite the name, one-character words are removed as well: only words
    with ``len(word) > 2`` survive. Remaining words are re-joined with
    single spaces.

    The previous implementation looped over ``range(len(text))`` and used
    ``text[i]``. On a pandas Series that is a label lookup, so any Series
    whose index is not exactly ``0..n-1`` raised ``KeyError``. It also
    overwrote the caller's object in place. Both are avoided here by mapping
    element-wise and returning a new container.

    Args:
        text: A pandas Series of strings, or any other iterable of strings.

    Returns:
        For a Series, a new Series with the same index; for any other
        iterable, a new list. The input is left unmodified.
    """
    def _drop_short(doc):
        return ' '.join(word for word in doc.split() if len(word) > 2)

    if isinstance(text, pd.Series):
        # Element-wise map keeps the caller's index and never touches labels.
        return text.map(_drop_short)
    return [_drop_short(doc) for doc in text]

def remove_stopwords_corpus(text):
    """Remove stop words from every document via gensim's ``remove_stopwords``.

    The previous version sliced every result with ``[2:-1]`` unconditionally,
    which cuts two leading and one trailing character off ordinary strings.
    NOTE(review): the slice was presumably meant to peel a ``b'...'``
    bytes-literal wrapper off stringified bytes; confirm against the data.
    The wrapper is now removed only when it is actually present, so wrapped
    input behaves as before and plain text is no longer truncated.

    Args:
        text: An object with an ``.apply`` method yielding strings
            (e.g. a pandas Series).

    Returns:
        The result of ``text.apply`` with stop words removed from each entry.
    """
    def _clean(doc):
        kept = remove_stopwords(doc)
        looks_wrapped = (
            len(kept) >= 3
            and kept[0] == 'b'
            and kept[1] in ('"', "'")
            and kept[-1] == kept[1]
        )
        return kept[2:-1] if looks_wrapped else kept

    return text.apply(_clean)

def generate_corpus(text):
    """Turn a column of raw post text into a list of cleaned, stemmed documents.

    Steps, in order: lower-case, ``remove_spec_chars``,
    ``remove_two_letter_words``, then gensim ``stem_text`` on each document.

    Two defects of the previous version are fixed:

    * Each stemmed document was sliced with ``[2:-1]``, discarding its first
      two and last characters and so corrupting the first and last token of
      every document. Apostrophes are already removed by the cleaning step,
      so no quote wrapper can exist at that point; the slice is dropped.
      NOTE(review): if the raw text is wrapped as ``b'...'``, strip that
      before cleaning instead; confirm against the data.
    * The index is reset to ``0..n-1`` after lower-casing so helpers that
      address rows by position work for any input index. The result is a
      plain list, so no index information is lost.

    Args:
        text: A pandas Series of strings.

    Returns:
        A list with one stemmed string per input row, in input order.
    """
    # to_lowercase returns a new object, so the caller's Series is untouched.
    text = to_lowercase(text).reset_index(drop=True)
    text = remove_spec_chars(text)
    text = remove_two_letter_words(text)
    return [stem_text(row) for row in text]

def generate_train_dtm(text):
    """Fit a fresh ``CountVectorizer`` on ``text`` and build its count matrix.

    Args:
        text: Iterable of documents accepted by ``CountVectorizer.fit_transform``.

    Returns:
        A ``(matrix, vectorizer)`` tuple: the result of ``fit_transform`` and
        the fitted vectorizer, so the same vocabulary can be applied to other
        text later.
    """
    vectorizer = CountVectorizer()
    doc_term_matrix = vectorizer.fit_transform(text)
    return doc_term_matrix, vectorizer

def generate_test_dtm(text, cv):
    """Apply an already-fitted vectorizer to ``text``.

    Args:
        text: Iterable of documents.
        cv: Object exposing ``transform`` (the notebook passes the
            vectorizer returned by ``generate_train_dtm``).

    Returns:
        Whatever ``cv.transform(text)`` returns; ``cv`` is not refitted here.
    """
    doc_term_matrix = cv.transform(text)
    return doc_term_matrix

def merge_text_and_cols(text_csr, otr_cols_csr):
    """Stack the extra feature columns and the text matrix side by side.

    Note the column order of the result: ``otr_cols_csr`` comes first,
    followed by ``text_csr``, which is the reverse of the parameter order.

    Args:
        text_csr: Sparse matrix of text features.
        otr_cols_csr: Sparse matrix of the other feature columns, with the
            same number of rows.

    Returns:
        The sparse matrix produced by ``scipy.sparse.hstack``.
    """
    blocks = (otr_cols_csr, text_csr)
    return sparse.hstack(blocks)

In [3]:
# Path of the pickled training frame inside the Kaggle input mount.
TRAIN_FILE = "/kaggle/input/traintestpkls-codeproject/train_pickle.pkl"

# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
train_df = pd.read_pickle(TRAIN_FILE)
# Clean the PostText column and fit the vectorizer on it.
# `cv` is kept because the test cell reuses it to transform the test text.
train_text = train_df['PostText']
train_text_csr, cv = generate_train_dtm(generate_corpus(train_text))

# The other feature columns, converted to a sparse matrix so they can be stacked with the text counts.
train_otr_cols = train_df[['Karma', 'NumAnswers', 'accountAge']]
train_otr_cols_csr = csr_matrix(train_otr_cols.values)

# Resulting column layout: the three columns above first, then the text columns.
train_csr = merge_text_and_cols(train_text_csr, train_otr_cols_csr)

# Persist the combined matrix under a relative path (current working directory).
sparse.save_npz("train.npz", train_csr)

In [4]:
# Path of the pickled test frame inside the Kaggle input mount.
TEST_FILE = "/kaggle/input/traintestpkls-codeproject/test_pickle.pkl"
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
test_df = pd.read_pickle(TEST_FILE)
# Clean the test text the same way as the training text, then transform it with
# the vectorizer `cv` fitted in the training cell (transform only, no refit).
test_text = test_df['PostText']
test_text_csr = generate_test_dtm(generate_corpus(test_text), cv)

# Same three extra columns as in the training cell, as a sparse matrix.
test_otr_cols = test_df[['Karma', 'NumAnswers', 'accountAge']]
test_otr_cols_csr = csr_matrix(test_otr_cols.values)

# Resulting column layout: the three columns above first, then the text columns.
test_csr = merge_text_and_cols(test_text_csr, test_otr_cols_csr)

# Persist the combined matrix under a relative path (current working directory).
sparse.save_npz("test.npz", test_csr)