In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from scipy.sparse import hstack

In [2]:
# Read the raw train/test CSVs; every missing cell is replaced by a single space.
train = pd.read_csv('Data/train.csv').fillna(' ')
test = pd.read_csv('Data/test.csv').fillna(' ')
# Show the (rows, columns) shape of each frame, one per line.
print(train.shape, test.shape, sep='\n')

(159571, 8)
(153164, 2)


In [5]:
def load_train_data(train_file_path, valid_rate=0.1, is_df=True):
    """Read a training CSV, shuffle it, and split it into train/validation parts.

    Rows are shuffled with a fixed seed, then every row draws a uniform random
    number and goes to the training part when the draw is below
    ``1 - valid_rate``. The validation part therefore holds *approximately*
    (not exactly) ``valid_rate`` of the rows.

    Args:
        train_file_path: Path of the CSV file to read. The non-DataFrame
            return form expects ``id`` and ``comment_text`` columns and treats
            every column from the third onward as a label column.
        valid_rate: Approximate fraction of rows placed in the validation part.
        is_df: When True, return the two parts as DataFrames; when False,
            return them unpacked into lists and arrays.

    Returns:
        ``(train_df, valid_df)`` when ``is_df`` is True. Otherwise a 6-tuple
        ``(train_ids, train_texts, train_labels, valid_ids, valid_texts,
        valid_labels)`` where ids/texts are Python lists and labels are 2-D
        NumPy arrays with one column per label.

    Note:
        This function seeds NumPy's global random generator with 11 on every
        call, which makes the split reproducible but also resets the global
        random state for any code that runs afterwards.
    """
    data_frame = pd.read_csv(train_file_path).sample(frac=1, random_state=11)
    np.random.seed(11)
    mask = np.random.rand(len(data_frame)) < 1 - valid_rate
    train_df, valid_df = data_frame.iloc[mask, :], data_frame.iloc[~mask, :]
    if is_df:
        return train_df, valid_df

    # `.values` replaces `DataFrame.as_matrix()`, which was removed in
    # pandas 1.0; `.values` is available on both old and new pandas.
    # train_ids, train_texts, train_labels (6 columns), valid_ids, valid_texts, valid_labels (6 columns)
    return (
        train_df['id'].tolist(),
        train_df['comment_text'].tolist(),
        train_df.iloc[:, 2:].values,
        valid_df['id'].tolist(),
        valid_df['comment_text'].tolist(),
        valid_df.iloc[:, 2:].values,
    )


def load_test_data(test_file_path, test_label_file_path, is_df=True):
    """Read the test texts and their labels, keeping only rows with a usable label.

    Rows whose ``toxic`` label equals -1 are removed from both the text frame
    and the label frame (presumably -1 marks rows without a real label;
    confirm against the data description).

    The filter mask is computed on the label file and applied to the text
    file, so this relies on both CSVs listing the same rows in the same order.

    Args:
        test_file_path: Path of the CSV with ``id`` and ``comment_text`` columns.
        test_label_file_path: Path of the CSV whose first column is the id and
            whose remaining columns are the labels (must include ``toxic``).
        is_df: When True, return one DataFrame; when False, return lists and
            an array.

    Returns:
        When ``is_df`` is True, a DataFrame made of the text frame's columns
        followed by the label columns. Otherwise a 3-tuple
        ``(test_ids, test_texts, test_labels)`` where ids/texts are Python
        lists and labels is a 2-D NumPy array with one column per label.
    """
    data_frame = pd.read_csv(test_file_path)
    label_frame = pd.read_csv(test_label_file_path)
    targets = label_frame['toxic'] != -1
    data_frame, label_frame = data_frame[targets], label_frame[targets]
    if is_df:
        # `axis` must be passed by keyword: pandas 2.x rejects it positionally.
        return pd.concat([data_frame, label_frame.iloc[:, 1:]], axis=1)

    # Labels come from label_frame (all columns after the id); the text frame
    # has no label columns. `.values` replaces the removed `as_matrix()`.
    # test_ids, test_texts, test_labels (6 columns)
    return (
        data_frame['id'].tolist(),
        data_frame['comment_text'].tolist(),
        label_frame.iloc[:, 1:].values,
    )

In [10]:
# Split the training CSV into train/validation frames and dump the validation part.
tdf, vdf = load_train_data('Data/train.csv')
print('This is vdf?', vdf, sep='\n')

This is vdf?
                      id                                       comment_text  \
56041   95bf7672a11f2799  Risk factors\n\nThe role of chlamydia should b...   
147196  3888c875d6a6a680  "\nHe isnt listed under ""Other"" though, last...   
52015   8b39ad2e8b8ab63f  So can we use special needs instead of saying ...   
85372   e45b67ab32b58e9d  Right, basically, if something is confusing or...   
78183   d14461a0cb85bf6f  being made by removing well-sourced material f...   
57665   9a4c3846ed8698cc  "\n\n Unreliable Sources. \n\nI have removed t...   
24208   3fedcda6d45515e4   Mmmm thanks for the informative explanation.Jase   
30524   510794dcd4ba193c  I'm tired of the shenanigans. STOP REMOVING TH...   
41637   6f0fc67520097a9d  Nevermind, apparently he made an edit to preve...   
78141   d12c5deb1da16377  I think that's an understatement; I think it d...   
24997   421894cef334be05  "\n\nProposed addition of two See Also and two...   
53848   8fd6da89a3e50586  I think that 