In [1]:
import pandas as pd
import scipy.sparse
import scipy
import numpy as np
from sklearn.decomposition import PCA
from sklearn.utils import resample
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.manifold import TSNE

In [2]:
# Both paths are relative to the notebook's working directory.
train_clean = pd.read_csv('Data/train_clean.tsv', sep='\t', header=0)

# The .mtx file is parsed as whitespace-delimited text with its first line
# skipped and no header row, so columns are labelled 0, 1, 2.
sparse_matrix = pd.read_csv('Data/train_term_matrix.mtx', sep=r"\s+",
                            header=None, skiprows=1)



In [3]:
# Separate out the rows, columns, entries information held in the first row.
# `.iat` reads each cell from its own column, so the dimensions are not upcast
# to float when the value column is non-integer; `int()` guarantees the plain
# integers that the sparse-matrix `shape` argument needs.
format_sparse = sparse_matrix.iloc[:1]
total_rows = int(format_sparse.iat[0, 0])
total_columns = int(format_sparse.iat[0, 1])
total_entries = int(format_sparse.iat[0, 2])

In [4]:
# Discard the first row (rows, columns, entries) so only the entry triplets remain.
# NOTE: re-running this cell removes one more row each time.
sparse_matrix = sparse_matrix.iloc[1:]

In [5]:
# The indices come from R, which counts from 1; shift them to Python's 0-based indexing.
sparse_rows = sparse_matrix.iloc[:, 0].to_numpy() - 1
sparse_columns = sparse_matrix.iloc[:, 1].to_numpy() - 1

In [6]:
# The stored entry values are used exactly as read (no shift needed).
sparse_values = sparse_matrix.iloc[:, 2].to_numpy()

In [None]:
# Assemble the CSR matrix from the (value, (row, column)) triplets.
# NOTE: this rebinds `sparse_matrix` from the triplet DataFrame to the CSR matrix.
matrix_shape = (total_rows, total_columns)
triplet_coords = (sparse_rows, sparse_columns)
sparse_matrix = scipy.sparse.csr_matrix((sparse_values, triplet_coords),
                                        shape=matrix_shape)


In [None]:
%%time

# TDIF
tf_idf = TfidfTransformer()

sparse_matrix_tf_idf = tf_idf.fit_transform(sparse_matrix)

samples = 10000
state = 95

# Sample Data to reduce the size
sparse_matrix_sample = resample(sparse_matrix_tf_idf.todense(), 
                                n_samples=samples, random_state=state)



In [None]:
# Make sure prices are floating point, then preview the first rows.
train_clean = train_clean.astype({'price': float})
print(train_clean.head(5))


In [None]:
# Per-category price statistics, computed from a single grouping.
price_by_category = train_clean.groupby('category_name_1')['price']
mean_values = price_by_category.mean()
std_values = price_by_category.std()
count_values = price_by_category.count()

# One column per statistic, indexed by category.
summary_category = pd.DataFrame({
    'Mean Price': mean_values,
    'St Dev Price': std_values,
    "Listings Count": count_values,
})

print(summary_category.round(2))

In [None]:
import seaborn as sns

print("note: removed outliers")

# Boxplot of price per top-level category; fliers (outliers) are not drawn.
fig, ax = plt.subplots(figsize=(11.7, 8.27))
ax.set_title("Price by Category boxplot")
plt.xticks(rotation=90)
ax = sns.boxplot(data=train_clean, x='category_name_1', y='price',
                 showfliers=False, ax=ax)

In [None]:
# Bucket prices into 4 equal-width bins. The original referenced an undefined
# `df`; the prices live in `train_clean`.
train_clean['price_bin'] = pd.cut(train_clean['price'], bins=4)

In [None]:
%%time

# Fit PCA with top 2 components on a subset of the data
pca = PCA(n_components=2)

descriptions_pca = pca.fit(sparse_matrix_sample.todense())


In [None]:
%%time
descriptions_pca = pca.transform(sparse_matrix_sample.todense())

In [None]:
%matplotlib inline

plt.scatter(descriptions_pca[:,0], descriptions_pca[:,1])
plt.show()

In [None]:
%%time
tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
app_predict_tsne2d = tsne.fit_transform(sparse_matrix_sample.todense())

In [None]:
%%time
descriptions_pca_all_data = pca.transform(sparse_matrix_tf_idf.todense())



In [None]:
# Scatter of all rows in the plane of the two principal components.
pc_first_all = descriptions_pca_all_data[:, 0]
pc_second_all = descriptions_pca_all_data[:, 1]
plt.scatter(pc_first_all, pc_second_all)
plt.show()