In [1]:
from sklearn import preprocessing
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shutil
import os
import requests
import base64


# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = "{}-{}".format(name, x)
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)


# Encode text values to a single dummy variable.  The new columns (which do not replace the old) will have a 1
# at every location where the original column (name) matches each of the target_values.  One column is added for
# each target value.
def encode_text_single_dummy(df, name, target_values):
    for tv in target_values:
        l = list(df[name].astype(str))
        l = [1 if str(x) == str(tv) else 0 for x in l]
        name2 = "{}-{}".format(name, tv)
        df[name2] = l


# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_


# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()

    if sd is None:
        sd = df[name].std()

    df[name] = (df[name] - mean) / sd


# Convert all missing values in the specified column to the median
def missing_median(df, name):
    med = df[name].median()
    df[name] = df[name].fillna(med)


# Convert all missing values in the specified column to the default
def missing_default(df, name, default_value):
    df[name] = df[name].fillna(default_value)


# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column.  Is it really this hard? :(
    target_type = df[target].dtypes
    target_type = target_type[0] if hasattr(target_type, '__iter__') else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return df.as_matrix(result).astype(np.float32), dummies.as_matrix().astype(np.float32)
    else:
        # Regression
        return df.as_matrix(result).astype(np.float32), df.as_matrix([target]).astype(np.float32)

# Nicely formatted time string
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)


# Regression chart.
def chart_regression(pred,y,sort=True):
    t = pd.DataFrame({'pred' : pred, 'y' : y.flatten()})
    if sort:
        t.sort_values(by=['y'],inplace=True)
    a = plt.plot(t['y'].tolist(),label='expected')
    b = plt.plot(t['pred'].tolist(),label='prediction')
    plt.ylabel('output')
    plt.legend()
    plt.show()

# Remove all rows where the specified column is +/- sd standard deviations
def remove_outliers(df, name, sd):
    drop_rows = df.index[(np.abs(df[name] - df[name].mean()) >= (sd * df[name].std()))]
    df.drop(drop_rows, axis=0, inplace=True)


# Encode a column to a range between normalized_low and normalized_high.
def encode_numeric_range(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
               * (normalized_high - normalized_low) + normalized_low
        
# This function submits an assignment.  You can submit an assignment as much as you like, only the final
# submission counts.  The paramaters are as follows:
# data - Pandas dataframe output.
# key - Your student key that was emailed to you.
# no - The assignment class number, should be 1 through 1.
# source_file - The full path to your Python or IPYNB file.  This must have "_class1" as part of its name.  
# .             The number must match your assignment number.  For example "_class2" for class assignment #2.
def submit(data,key,no,source_file=None):
    if source_file is None and '__file__' not in globals(): raise Exception('Must specify a filename when a Jupyter notebook.')
    if source_file is None: source_file = __file__
    suffix = '_class{}'.format(no)
    if suffix not in source_file: raise Exception('{} must be part of the filename.'.format(suffix))
    with open(source_file, "rb") as image_file:
        encoded_python = base64.b64encode(image_file.read()).decode('ascii')
    ext = os.path.splitext(source_file)[-1].lower()
    if ext not in ['.ipynb','.py']: raise Exception("Source file is {} must be .py or .ipynb".format(ext))
    r = requests.post("https://api.heatonresearch.com/assignment-submit",
        headers={'x-api-key':key}, json={'csv':base64.b64encode(data.to_csv(index=False).encode('ascii')).decode("ascii"),
        'assignment': no, 'ext':ext, 'py':encoded_python})
    if r.status_code == 200:
        print("Success: {}".format(r.text))
    else: print("Failure: {}".format(r.text))

In [2]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
import numpy as np
from tensorflow.contrib.learn.python.learn.metric_spec import MetricSpec
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.callbacks import EarlyStopping

Using TensorFlow backend.


In [3]:


data_train=pd.read_csv("/Users/apple/study/DNN/kaggle/in-class competition/all/train.csv")
data_train[1:10]

Unnamed: 0,id,shape,metal,metal_cost,height,width,length,led,gears,motors,led_vol,motor_vol,gear_vol,volume_parts,cost,weight
1,100001,sphere,bronze,0.05,0,0,7,45,13,7,?,?,?,?,75.0,1008
2,100002,box,bronze,0.05,8,9,8,10,49,15,?,?,?,?,288.0,4928
3,100003,cylinder,platinum,29.44,7,8,0,15,45,18,0.40499999999999997,?,?,?,?,3286
4,100004,sphere,bronze,0.05,0,0,7,5,37,9,?,?,?,?,63.0,698
5,100005,sphere,gold,39.1,0,0,3,16,44,1,?,?,?,?,10579.0,269
6,100006,box,silver,0.47,9,5,7,26,12,1,?,?,?,?,2164.0,4575
7,100007,box,tin,0.06,9,8,5,10,49,11,0.27,?,?,?,167.0,2164
8,100008,box,bronze,0.05,8,9,2,58,49,19,?,?,?,?,?,122
9,100009,sphere,platinum,29.44,0,0,7,84,39,12,?,?,?,?,?,1238


In [12]:
data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 879004 entries, 0 to 879003
Data columns (total 16 columns):
id              879004 non-null int64
shape           879004 non-null object
metal           879004 non-null object
metal_cost      879004 non-null float64
height          879004 non-null int64
width           879004 non-null int64
length          879004 non-null int64
led             879004 non-null int64
gears           879004 non-null int64
motors          879004 non-null int64
led_vol         879004 non-null object
motor_vol       879004 non-null object
gear_vol        879004 non-null object
volume_parts    879004 non-null object
cost            879004 non-null object
weight          879004 non-null int64
dtypes: float64(1), int64(8), object(7)
memory usage: 107.3+ MB


In [13]:
data_train['metal'].describe()

count       879004
unique           5
top       platinum
freq        176182
Name: metal, dtype: object

In [6]:
data_train.drop('id', axis=1, inplace=True)

In [7]:
data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 879004 entries, 0 to 879003
Data columns (total 15 columns):
shape           879004 non-null object
metal           879004 non-null object
metal_cost      879004 non-null float64
height          879004 non-null int64
width           879004 non-null int64
length          879004 non-null int64
led             879004 non-null int64
gears           879004 non-null int64
motors          879004 non-null int64
led_vol         879004 non-null object
motor_vol       879004 non-null object
gear_vol        879004 non-null object
volume_parts    879004 non-null object
cost            879004 non-null object
weight          879004 non-null int64
dtypes: float64(1), int64(7), object(7)
memory usage: 100.6+ MB


In [8]:
df_sphere=data_train[data_train['shape']=="sphere"]
df_sphere.loc[:,'volume']=((df_sphere.length/2)**3)*3.14159*4/3
data_train.loc[(data_train['shape']=="sphere"),'volume']=df_sphere.volume
df_cylinder=data_train[data_train['shape']=="cylinder"]
df_cylinder.loc[:,'volume']=3.14159*df_cylinder.height*((df_cylinder.width/2)**2)
data_train.loc[(data_train['shape']=="cylinder"),'volume']=df_cylinder.volume
df_box=data_train[data_train['shape']=="box"]
df_box.loc[:,'volume']=df_box.height*df_box.width*df_box.length
data_train.loc[(data_train['shape']=="box"),'volume']=df_box.volume
data_train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Unnamed: 0,shape,metal,metal_cost,height,width,length,led,gears,motors,led_vol,motor_vol,gear_vol,volume_parts,cost,weight,volume
0,cylinder,platinum,29.44,9,7,0,47,45,11,?,?,?,?,187061.0,6352,346.360298
1,sphere,bronze,0.05,0,0,7,45,13,7,?,?,?,?,75.0,1008,179.594228
2,box,bronze,0.05,8,9,8,10,49,15,?,?,?,?,288.0,4928,576.000000
3,cylinder,platinum,29.44,7,8,0,15,45,18,0.40499999999999997,?,?,?,?,3286,351.858080
4,sphere,bronze,0.05,0,0,7,5,37,9,?,?,?,?,63.0,698,179.594228
5,sphere,gold,39.10,0,0,3,16,44,1,?,?,?,?,10579.0,269,14.137155
6,box,silver,0.47,9,5,7,26,12,1,?,?,?,?,2164.0,4575,315.000000
7,box,tin,0.06,9,8,5,10,49,11,0.27,?,?,?,167.0,2164,360.000000
8,box,bronze,0.05,8,9,2,58,49,19,?,?,?,?,?,122,144.000000
9,sphere,platinum,29.44,0,0,7,84,39,12,?,?,?,?,?,1238,179.594228


In [9]:
dummies_shape=pd.get_dummies(data_train['shape'],prefix='shape')
dummies_metal=pd.get_dummies(data_train['metal'],prefix='metal')
df=pd.concat([data_train,dummies_shape,dummies_metal],axis=1)
df.drop(['shape','metal','motor_vol','gear_vol','volume_parts'],axis=1,inplace=True)

In [10]:
df[1:10]

Unnamed: 0,metal_cost,height,width,length,led,gears,motors,led_vol,cost,weight,volume,shape_box,shape_cylinder,shape_sphere,metal_bronze,metal_gold,metal_platinum,metal_silver,metal_tin
1,0.05,0,0,7,45,13,7,?,75.0,1008,179.594228,0,0,1,1,0,0,0,0
2,0.05,8,9,8,10,49,15,?,288.0,4928,576.0,1,0,0,1,0,0,0,0
3,29.44,7,8,0,15,45,18,0.40499999999999997,?,3286,351.85808,0,1,0,0,0,1,0,0
4,0.05,0,0,7,5,37,9,?,63.0,698,179.594228,0,0,1,1,0,0,0,0
5,39.1,0,0,3,16,44,1,?,10579.0,269,14.137155,0,0,1,0,1,0,0,0
6,0.47,9,5,7,26,12,1,?,2164.0,4575,315.0,1,0,0,0,0,0,1,0
7,0.06,9,8,5,10,49,11,0.27,167.0,2164,360.0,1,0,0,0,0,0,0,1
8,0.05,8,9,2,58,49,19,?,?,122,144.0,1,0,0,1,0,0,0,0
9,29.44,0,0,7,84,39,12,?,?,1238,179.594228,0,0,1,0,0,1,0,0


In [11]:
#multiplier=df.loc[(df['led_vol']==0.27),:].led./df.loc[(df['led_vol']=="0.27"),:].led_vol
multiplier1=float(df.loc[(df['led_vol']=="0.27"),:].led_vol.unique())
multiplier2=float(df.loc[(df['led_vol']=="0.27"),:].led.unique())
#print(multiplier1)
#print(multiplier2)
multiplier=multiplier1/multiplier2
print(multiplier)



0.027000000000000003


In [12]:
df_quest=df.loc[(df['led_vol']=="?"),:]
df_quest['led_vol']=df_quest.led*multiplier
df.loc[(df['led_vol']=="?"),'led_vol']=df_quest['led_vol']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [13]:
df[1:10]


Unnamed: 0,metal_cost,height,width,length,led,gears,motors,led_vol,cost,weight,volume,shape_box,shape_cylinder,shape_sphere,metal_bronze,metal_gold,metal_platinum,metal_silver,metal_tin
1,0.05,0,0,7,45,13,7,1.215,75.0,1008,179.594228,0,0,1,1,0,0,0,0
2,0.05,8,9,8,10,49,15,0.27,288.0,4928,576.0,1,0,0,1,0,0,0,0
3,29.44,7,8,0,15,45,18,0.4049999999999999,?,3286,351.85808,0,1,0,0,0,1,0,0
4,0.05,0,0,7,5,37,9,0.135,63.0,698,179.594228,0,0,1,1,0,0,0,0
5,39.1,0,0,3,16,44,1,0.432,10579.0,269,14.137155,0,0,1,0,1,0,0,0
6,0.47,9,5,7,26,12,1,0.702,2164.0,4575,315.0,1,0,0,0,0,0,1,0
7,0.06,9,8,5,10,49,11,0.27,167.0,2164,360.0,1,0,0,0,0,0,0,1
8,0.05,8,9,2,58,49,19,1.566,?,122,144.0,1,0,0,1,0,0,0,0
9,29.44,0,0,7,84,39,12,2.268,?,1238,179.594228,0,0,1,0,0,1,0,0


In [14]:
from sklearn.ensemble import RandomForestRegressor
def set_missing_cost(df):
    cost_df=df[['cost','metal_cost','volume','shape_box','shape_cylinder','shape_sphere','led','gears','motors']]
    
    known_cost=cost_df[cost_df.cost!="?"].as_matrix()
    unknown_cost=cost_df[cost_df.cost=="?"].as_matrix()
    
    y=known_cost[:,0]
    X=known_cost[:,1:]
    
    rfr=RandomForestRegressor(random_state=0,n_estimators=1000,n_jobs=-1)
    rfr.fit(X,y)
    predictedCost=rfr.predict(unknown_cost[:,1::])
    df.loc[(df.cost=="?"),'cost']=predictedCost
    return df,rfr



In [15]:
df,rfr=set_missing_cost(df)

  """
  


In [18]:
df
df.to_csv("/Users/apple/study/DNN/kaggle/in-class competition/set_missing_cost.csv", index=False)

In [3]:
df=pd.read_csv("/Users/apple/study/DNN/kaggle/in-class competition/set_missing_cost.csv")

In [4]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 879004 entries, 0 to 879003
Data columns (total 19 columns):
metal_cost        879004 non-null float64
height            879004 non-null int64
width             879004 non-null int64
length            879004 non-null int64
led               879004 non-null int64
gears             879004 non-null int64
motors            879004 non-null int64
led_vol           879004 non-null float64
cost              879004 non-null float64
weight            879004 non-null int64
volume            879004 non-null float64
shape_box         879004 non-null int64
shape_cylinder    879004 non-null int64
shape_sphere      879004 non-null int64
metal_bronze      879004 non-null int64
metal_gold        879004 non-null int64
metal_platinum    879004 non-null int64
metal_silver      879004 non-null int64
metal_tin         879004 non-null int64
dtypes: float64(4), int64(15)
memory usage: 127.4 MB


In [5]:
df['cost'] = df['cost'].astype(float)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 879004 entries, 0 to 879003
Data columns (total 19 columns):
metal_cost        879004 non-null float64
height            879004 non-null int64
width             879004 non-null int64
length            879004 non-null int64
led               879004 non-null int64
gears             879004 non-null int64
motors            879004 non-null int64
led_vol           879004 non-null float64
cost              879004 non-null float64
weight            879004 non-null int64
volume            879004 non-null float64
shape_box         879004 non-null int64
shape_cylinder    879004 non-null int64
shape_sphere      879004 non-null int64
metal_bronze      879004 non-null int64
metal_gold        879004 non-null int64
metal_platinum    879004 non-null int64
metal_silver      879004 non-null int64
metal_tin         879004 non-null int64
dtypes: float64(4), int64(15)
memory usage: 127.4 MB


In [7]:

train_df=df[['weight','led','gears','motors','cost','volume','shape_box','shape_cylinder','shape_sphere','metal_bronze','metal_gold','metal_platinum','metal_silver','metal_tin']]

In [8]:
train_df[1:5]
train_df.to_csv("/Users/apple/study/DNN/kaggle/in-class competition/iris_weight.csv", index=False)


In [9]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 879004 entries, 0 to 879003
Data columns (total 14 columns):
weight            879004 non-null int64
led               879004 non-null int64
gears             879004 non-null int64
motors            879004 non-null int64
cost              879004 non-null float64
volume            879004 non-null float64
shape_box         879004 non-null int64
shape_cylinder    879004 non-null int64
shape_sphere      879004 non-null int64
metal_bronze      879004 non-null int64
metal_gold        879004 non-null int64
metal_platinum    879004 non-null int64
metal_silver      879004 non-null int64
metal_tin         879004 non-null int64
dtypes: float64(2), int64(12)
memory usage: 93.9 MB


In [None]:
encode_numeric_zscore(train_df,'led')
encode_numeric_zscore(train_df,'gears')
encode_numeric_zscore(train_df,'motors')
encode_numeric_zscore(train_df,'cost')
encode_numeric_zscore(train_df,'volume')

# Create the x-side (feature vectors) of the training
x, y = to_xy(train_df,'weight')
    
# Split into train/test
x_train, x_test, y_train, y_test = train_test_split(    
    x, y, test_size=0.25, random_state=45)

model = Sequential()
model.add(Dense(20, input_dim=x.shape[1], activation='relu'))
model.add(Dense(10))
model.add(Dense(y.shape[1],activation='linear'))
model.compile(loss='mean_squared_error', optimizer='adam')
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=1, mode='auto')

model.fit(x_train,y_train,validation_data=(x_test,y_test),callbacks=[monitor],verbose=0,epochs=1000)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
