In [None]:
# import packages here
import time
from collections import Counter

import matplotlib.pyplot as plt 
import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
# NOTE(review): sklearn.grid_search was removed in scikit-learn 0.20; on newer
# versions GridSearchCV is imported from sklearn.model_selection instead.
from sklearn.grid_search import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [None]:
# import data here
# data are tsv files, so slight manipulation to read
# (test.tsv in the same directory has no response variable and is not loaded here)
TRAIN_PATH = '/home/bsong/Python_Stuff/Data/Kaggle_Mercari/train.tsv'

start_time = time.time()

train_raw = pd.read_csv(TRAIN_PATH, delimiter='\t')
#train_raw = train_raw.iloc[0:10000,] # just a bit

# Every later cell reads the data as `train`, which was never assigned.
# Snapshot the frame as loaded, before the price column is rewritten below.
train = train_raw.copy()

# standardize price here because may as well:
# log1p first (finite at price == 0), then z-score. The mean and std are kept
# as globals so the transform can be inverted later.
normalized_price = np.log1p(train_raw['price'].values)
mean_price_norm = np.mean(normalized_price)
std_price_norm = np.std(normalized_price)
train_raw['price'] = (normalized_price - mean_price_norm)/std_price_norm

end_time = time.time()
print('import data took ' + str(end_time - start_time) + " seconds.")

In [None]:
# Separate the target column from the predictors; the row id carries no signal.
y = train['price']
X = train.drop(labels=['train_id', 'price'], axis='columns')

# Hold-out split, left disabled here (meant for a separate modeling notebook):
#X_train,X_val,y_train,y_val = train_test_split(X,y,test_size = 0.3) # use this in separate ipynb for modeling

# Disabled conversion to tensorflow input functions, kept for reference:
#train_tf = tf.estimator.inputs.pandas_input_fn(x = pd.DataFrame(train_X), y = pd.Series(train_y), shuffle = True)
#test_tf = tf.estimator.inputs.pandas_input_fn(x = pd.DataFrame(test), shuffle = True)

In [None]:
# Cardinality and missingness of brand_name.
brand_names = X['brand_name']
# dropna=False counts "missing" as exactly one distinct value, instead of relying on
# how set() happens to de-duplicate NaN objects.
print(brand_names.nunique(dropna=False)) # 4810 unique brand names (and a lot of NaN)
print(brand_names.isnull().sum()) # 632682 NaN out of 1.4mil

In [None]:
# look at price
# NOTE(review): assumes `y` is the price as loaded (not the standardized copy kept
# in train_raw) - confirm.

print('mean: ' + str(np.mean(y)))
print('standard deviation: ' + str(np.std(y)))
plt.hist(y, bins = 300) # skewed right histogram
plt.axis([0, 200,0 ,550000])
plt.xlabel('price')
plt.ylabel('number of listings')
plt.show()

# log1p, matching the transform used when the data is loaded: it is finite at
# price == 0, whereas log(price + 1e-9) puts those rows near -20.7 and lets a
# few listings dominate the mean and standard deviation.
log_y = np.log1p(y.values)
mean_log_y = np.mean(log_y)
std_log_y = np.std(log_y)
standardized_y = (log_y - mean_log_y) / std_log_y

print(standardized_y)
plt.hist(standardized_y,bins = 100)
plt.xlabel('standardized log1p(price)')
plt.ylabel('number of listings')
plt.show()

In [None]:
# Show the log-price array followed by its smallest and largest entries.
for log_price_summary in (log_y, min(log_y), max(log_y)):
    print(log_price_summary)

In [None]:
# investigate the words in train
# block ends with appending words into a list

pattern = r'''(?x)          # set flag to allow verbose regexps
        (?:[A-Z]\.)+        # abbreviations, e.g. U.S.A.
      | \w+(?:-\w+)*        # words with optional internal hyphens
      | \$?\d+(?:\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
    '''
tokenizer = RegexpTokenizer(pattern)
prod_names = train['name']
list_of_words = []
total_names = len(prod_names)

# NOTE(review): names are lowercased before tokenizing, so the [A-Z] abbreviation
# alternative of the pattern never sees an uppercase letter here.
for counter, names in enumerate(prod_names, start=1): # reads one name at a time
    # A missing name is not a str (pandas reads it as NaN) and contributes no words;
    # calling .lower() on it would raise.
    if isinstance(names, str):
        tokenize_names = tokenizer.tokenize(names.lower()) # tokenize each name after making all lowercase
        # ignore words of length 2 or less, append the rest to the one shared list
        list_of_words.extend(word for word in tokenize_names if len(word) > 2)
    if counter % 200000 == 0:
        perc = round(counter/total_names*100)
        print((str(perc) + '%  complete'))
print('100% complete')

In [None]:
# Rank every collected word by how often it occurs (most frequent first).
# Removing NLTK's English stopwords beforehand was tried and found too slow:
#list_of_words_no_stopwords = list(filter(lambda x: x not in stopwords.words('english'),list_of_words)) # takes too long
word_frequencies = Counter()
word_frequencies.update(list_of_words)
top_words = word_frequencies.most_common()

In [None]:
# visualize the top words
# top_words is a list of (word, count) pairs, most frequent first.
x_ = [word for word, count in top_words] # names of word counts as list
y_ = [count for word, count in top_words] # count of word counts as list (ordered)

# Plot at most the first 200 counts; the x range follows the data so the cell
# also works when fewer than 200 distinct words were collected.
top_counts = y_[:200] # maybe just make 200 flags for now
plt.plot(range(len(top_counts)), top_counts)
plt.xlabel('word rank')
plt.ylabel('occurrences in product names')
plt.show()

#using top 200 words as flags for potential features

In [None]:
# Work on an independent copy so that new columns never touch `train` itself,
# and remember how many rows it has.
train_copy = train.copy(deep=True)
train_nrows = len(train_copy.index)

In [None]:
# add flags as features to train set
# One 0/1 column per top word: 1 when the word occurs in the product name.
top200_words = x_[:200]
word_to_col = {word: col for col, word in enumerate(top200_words)}
n_names = len(prod_names)

# Tokenize each name once, the same way the vocabulary was built (lowercase, then
# `tokenizer`). Comparing against a raw whitespace split would miss "Nike" vs "nike"
# and any word attached to punctuation.
# int8 keeps the 0/1 flags compact (n_names x number of words).
word_flags = np.zeros((n_names, len(top200_words)), dtype=np.int8)
for row, name in enumerate(prod_names):
    if isinstance(name, str): # a missing name has no words
        for token in tokenizer.tokenize(name.lower()):
            col = word_to_col.get(token)
            if col is not None:
                word_flags[row, col] = 1
    if (row + 1) % 200000 == 0:
        perc = round((row + 1)/n_names*100)
        print((str(perc) + '%  complete'))
print('100% complete')

# train_copy gains one column per top word.
# NOTE(review): a top word equal to an existing column name overwrites that column.
for word, col in word_to_col.items():
    train_copy[word] = word_flags[:, col]

In [None]:
# Print the column / dtype / memory summary of train_copy.
train_copy.info()

In [None]:
# Looking into 'category_name'
# Every entry is treated as a '/'-separated path. Two things are collected:
#   spaced_cat    - the entry with each '/' replaced by a space (one string per row)
#   all_cat_words - every path level of every row, stripped, pooled into one list
# NOTE(review): str() is applied first, so a missing (NaN) entry is handled as the
# text 'nan' - confirm that is intended.

cat_names = train['category_name']
spaced_cat = []
all_cat_words = []
counter = 0
n_categories = len(cat_names)

for counter, categories in enumerate(cat_names, start=1):
    levels = str(categories).split('/')
    spaced_cat.append(' '.join(levels))
    all_cat_words += [level.strip() for level in levels]
    if not counter % 200000:
        perc = round(counter*100/n_categories)
        print((str(perc) + '%  complete'))
print('100% complete')    

In [None]:
# Rank the pooled category levels by frequency (most frequent first).
count_categories = Counter(all_cat_words).most_common()
names_common_cat = [name for name, count in count_categories] # names of word counts as list
count_common_cat = [count for name, count in count_categories] # count of word counts as list (ordered)

# Plot at most the first 200 counts; the x range follows the data so the cell
# also works when fewer than 200 distinct levels exist.
top_cat_counts = count_common_cat[:200] # maybe use 25 top categories (or even 5)
plt.plot(range(len(top_cat_counts)), top_cat_counts)
plt.xlabel('category rank')
plt.ylabel('occurrences')
plt.show()

print(count_categories)

In [None]:
# include the category flags here (or maybe separate the files here)
# One 0/1 column per top category level: 1 when that level occurs in the row's
# category_name.

top25_categories = names_common_cat[:25]
cat_to_col = {cat: col for col, cat in enumerate(top25_categories)}
n_cat_rows = len(cat_names)

# Split on '/' and strip, exactly as when the levels were counted. A whitespace
# split cuts multi-word levels apart and leaves the '/' inside the pieces, so the
# counted names would not be matched reliably.
category_flags = np.zeros((n_cat_rows, len(top25_categories)), dtype=np.int8)
for row, categories in enumerate(cat_names):
    for level in str(categories).split('/'):
        col = cat_to_col.get(level.strip())
        if col is not None:
            category_flags[row, col] = 1
    if (row + 1) % 200000 == 0:
        perc = round((row + 1)*100/n_cat_rows)
        print((str(perc) + '%  complete'))
print('100% complete')

# train_copy gains one column per top category.
# NOTE(review): a category equal to an existing column name overwrites that column.
for cat, col in cat_to_col.items():
    train_copy[cat] = category_flags[:, col]

In [None]:
# bag of words on category_name

# Raw term counts of the space-separated category strings.
array_categories = np.array(spaced_cat)
count = CountVectorizer()
bag = count.fit_transform(array_categories)

# Re-weight the counts with tf-idf. fit_transform needs the count matrix as its
# argument; calling it with none raises a TypeError.
tfidf = TfidfTransformer()
bag_tfidf = tfidf.fit_transform(bag)
bag_tfidf

In [None]:
# test some tfidf prediction here
X = train.drop(['train_id','price'],axis=1)
y = train['price']
# 50/50 split. random_state is fixed so a re-run reproduces the same rows in each
# half, and therefore the same fitted model and score.
X_train,X_val,y_train,y_val = train_test_split(X,y,train_size = 0.5,random_state = 42) # use this in separate ipynb for modeling

In [None]:
# Looking into 'category_name' (training half of the split)
# Build one string per training row in which every '/' of the category text is
# replaced by a space.

cat_names = X_train['category_name']
spaced_cat = []
counter = 0
n_train_categories = len(cat_names)

for counter, categories in enumerate(cat_names, start=1):
    spaced_cat.append(' '.join(str(categories).split('/')))
    if not counter % 200000:
        perc = round(counter*100/n_train_categories)
        print((str(perc) + '%  complete'))
print('100% complete')    

In [None]:
# Looking into 'category_name' (validation half of the split)
# Build one string per validation row in which every '/' of the category text is
# replaced by a space.

cat_names_val = X_val['category_name']
spaced_cat_val = []
counter = 0

for categories in cat_names_val:
    spaced_cat_val.append(str(categories).replace('/', ' '))
    counter += 1
    if counter % 200000 == 0:
        # Progress is measured against the validation series being iterated,
        # not against the training series `cat_names`.
        perc = round(counter*100/len(cat_names_val))
        print((str(perc) + '%  complete'))
print('100% complete')    

In [None]:
# Fit a tf-idf vectorizer on the training category strings and keep the
# resulting document-term matrix in `bag`. Case is left as-is (lowercase=False).
vectorizer_options = {'strip_accents': None, 'lowercase': False, 'preprocessor': None}
tfidf = TfidfVectorizer(**vectorizer_options)
array_categories = np.array(spaced_cat)
bag = tfidf.fit_transform(array_categories)

In [None]:
from sklearn.linear_model import LinearRegression


In [None]:
# Fit a linear regression of the training prices (y_train) on the tf-idf
# category features (bag).
linreg = LinearRegression()
linreg.fit(bag,y_train)

In [None]:
# Encode the validation category strings with the vectorizer fitted on the
# training strings (transform only - the vocabulary is not refitted).
bag_val = tfidf.transform(np.array(spaced_cat_val))


In [None]:
# Predict prices for the validation rows and show the dtype of the result.
predicted_price = linreg.predict(bag_val)
predicted_price.dtype

In [None]:
# Score the fitted model on the validation half (for scikit-learn regressors
# `score` is documented as the coefficient of determination, R^2).
linreg.score(bag_val,y_val)