In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
# Input CSV locations, built relative to the parent of the working directory:
# <parent>/data/raw/{train,test}.csv
raw_data_path = os.path.join(os.path.pardir, 'data', 'raw')
train_file_path, test_file_path = (
    os.path.join(raw_data_path, csv_name)
    for csv_name in ('train.csv', 'test.csv')
)

In [3]:
# Load both CSVs the same way, using the 'Complaint-ID' column as the row index.
train_df, test_df = (
    pd.read_csv(csv_path, index_col='Complaint-ID')
    for csv_path in (train_file_path, test_file_path)
)

In [4]:
# Give every test row a placeholder status of -1 before stacking it under
# the training rows.
test_df['Complaint-Status'] = -1

# sort=False keeps the columns in their existing order and silences the
# FutureWarning pandas raises when the two frames' columns are not aligned.
df = pd.concat((train_df, test_df), axis=0, sort=False)

# Work on a small positional subset (rows 1..4999) of the stacked frame.
# NOTE(review): the slice starts at 1, so the very first row is dropped --
# confirm this is intentional and was not meant to be iloc[:5000].
df = df.iloc[1:5000, :]

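FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.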


  


In [5]:
# Copy the two hyphenated columns to underscore-named columns; the original
# columns are left in place. No preview call is made here because a notebook
# only displays the last expression of a cell, so a leading df.head() would
# show nothing.
df['Consumer_complaint_summary'] = df['Consumer-complaint-summary']
df['Transaction_Type'] = df['Transaction-Type']

In [6]:
# Columns kept for the rest of the notebook: the transaction type and the
# free-text complaint summary.
col = [
    'Transaction_Type',
    'Consumer_complaint_summary',
]

In [7]:
# Narrow the frame to the selected columns, then keep only the rows whose
# complaint summary is present.
df = df[col].loc[lambda frame: frame['Consumer_complaint_summary'].notnull()]

In [8]:
# Set the column labels explicitly: transaction type first, complaint text second.
df.columns = [
    'Transaction_Type',
    'Consumer_complaint_summary',
]

In [9]:
# Encode each transaction type as an integer code.
df['category_id'] = pd.factorize(df['Transaction_Type'])[0]

# One row per distinct (transaction type, code) pair, ordered by code.
category_id_df = (
    df[['Transaction_Type', 'category_id']]
    .drop_duplicates()
    .sort_values('category_id')
)

# Lookup tables in both directions between type name and integer code.
category_to_id = {name: code for name, code in category_id_df.values}
id_to_category = {code: name for name, code in category_id_df.values}
df.head()

Unnamed: 0_level_0,Transaction_Type,Consumer_complaint_summary,category_id
Complaint-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Tr-2,Credit reporting,XX / XX / XXXX La requête en faillite n ° XXXX...,0
Tr-3,Bank account or service,"El XXXX / XXXX / 15, estaba preparando el vuel...",1
Tr-4,Debt collection,"The loan was paid in XXXX XXXX. In XXXX, 4 yea...",2
Tr-5,Credit card,J'ai obtenu un compte de crédit de soins pour ...,3
Tr-6,Mortgage,The owner of my original mortgage filed for ba...,4


In [10]:
import matplotlib.pyplot as plt

# Bar chart of the number of complaint summaries per transaction type.
fig = plt.figure(figsize=(8, 6))
(
    df.groupby('Transaction_Type')['Consumer_complaint_summary']
    .count()
    .plot(kind='bar', ylim=0)
)
plt.show()

<Figure size 800x600 with 1 Axes>

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams, ignoring English stop words and any term
# that appears in fewer than 5 documents.
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    norm='l2',
    encoding='latin-1',
    ngram_range=(1, 2),
    stop_words='english',
)

# Dense (documents x terms) matrix and the matching integer labels.
features = tfidf.fit_transform(df['Consumer_complaint_summary']).toarray()
labels = df['category_id']
features.shape

(4999, 20802)

In [13]:
from sklearn.feature_selection import chi2

N = 2  # how many top-scoring terms to print per category and n-gram size

# The vocabulary is the same for every category, so fetch it once instead of
# rebuilding the array on each loop iteration. get_feature_names() was removed
# in scikit-learn 1.2 in favour of get_feature_names_out(); use whichever the
# installed version provides.
if hasattr(tfidf, 'get_feature_names_out'):
    all_feature_names = np.asarray(tfidf.get_feature_names_out())
else:
    all_feature_names = np.array(tfidf.get_feature_names())

for Transaction_Type, category_id in sorted(category_to_id.items()):
    # Chi-squared score of every term against "row belongs to this category".
    features_chi2 = chi2(features, labels == category_id)
    # Ascending sort, so the highest-scoring terms end up at the tail.
    indices = np.argsort(features_chi2[0])
    feature_names = all_feature_names[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    print("# '{}':".format(Transaction_Type))
    print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-N:])))
    print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-N:])))

# 'Bank account or service':
  . Most correlated unigrams:
. bank
. overdraft
  . Most correlated bigrams:
. debit card
. overdraft fees
# 'Checking or savings account':
  . Most correlated unigrams:
. devuelven
. provisional
  . Most correlated bigrams:
. demande que
. provisional credit
# 'Consumer Loan':
  . Most correlated unigrams:
. vehículo
. vehicle
  . Most correlated bigrams:
. ally financial
. santander consumer
# 'Credit card':
  . Most correlated unigrams:
. tarjeta
. card
  . Most correlated bigrams:
. tarjeta crédito
. credit card
# 'Credit card or prepaid card':
  . Most correlated unigrams:
. reward
. cards
  . Most correlated bigrams:
. citi credit
. paid minimum
# 'Credit reporting':
  . Most correlated unigrams:
. experian
. equifax
  . Most correlated bigrams:
. xxxx equifax
. credit report
# 'Credit reporting, credit repair services, or other personal consumer reports':
  . Most correlated unigrams:
. 2017
. equifax
  . Most correlated bigrams:
. did authorize
. x

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

# Split complaint text (inputs) and transaction type (target) into train and
# test parts with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df['Consumer_complaint_summary'],
    df['Transaction_Type'],
    random_state=0,
)

# Raw term counts, learned from the training text only.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)

# Re-weight the counts with TF-IDF.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# Multinomial naive Bayes fitted on the TF-IDF features.
clf = MultinomialNB()
clf.fit(X_train_tfidf, y_train)

In [15]:
# The classifier was fitted on TF-IDF weighted features, so new text has to go
# through the same two steps (counts, then TF-IDF) before prediction; feeding
# it raw counts would score it on a different feature scale than it was
# trained on.
print(clf.predict(tfidf_transformer.transform(count_vect.transform(["This company refuses to provide me verification and validation of debt per my right under the FDCPA. I do not believe this debt is mine."]))))

['Debt collection']
