In [39]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
import os
from sys import platform

# Instructions

1. Load the `train.csv` file
2. Explore the data, understand it
3. Process it for future training
4. Do train, test, split for your `train.csv` file
5. `fit/train` a model from your cleaned_train_df
-----
5. Load the `train.csv` file
6. Apply the same processing you did to `train.csv` into `test.csv`
7. `predict` the price for that file
8. Only keep the columns you need
9. Export
-----
10. Repeat! 🚀🔥

# Import the csv files

In [2]:
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

In [3]:
print(df_train.shape[0])
df_train.sample()

40455


Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
8489,8489,0.44,Ideal,D,VS1,62.3,55.0,4.88,4.84,3.03,7.165


In [4]:
print(df_test.shape[0])
df_test.sample()

13485


Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
12390,12390,0.51,Ideal,D,VS1,62.3,56.0,5.09,5.12,3.18


# Cleaning, processing, feature selection, etc

In [5]:
# Processing is necessary, otherwise we won't be able to fit a model
# For the sake of the example, we'll just drop categorical columns

df_train_cleaned = df_train.select_dtypes(exclude='object')
print(df_train_cleaned.shape[0])
df_train_cleaned

40455


Unnamed: 0,id,carat,depth,table,x,y,z,price
0,0,0.30,62.4,58.0,4.31,4.28,2.68,6.353
1,1,1.01,62.7,56.0,6.42,6.46,4.04,9.183
2,2,0.72,61.8,59.0,5.71,5.74,3.54,7.983
3,3,1.08,63.2,57.0,6.54,6.50,4.12,8.371
4,4,0.36,62.3,59.0,4.50,4.55,2.82,6.588
...,...,...,...,...,...,...,...,...
40450,40450,0.42,62.1,59.0,4.78,4.82,2.98,6.551
40451,40451,0.53,62.0,58.0,5.21,5.18,3.22,7.382
40452,40452,0.80,62.8,58.0,5.86,5.90,3.69,7.768
40453,40453,1.01,61.5,57.0,6.40,6.48,3.96,8.726


# Train on train.csv

![](https://builtin.com/sites/www.builtin.com/files/styles/ckeditor_optimize/public/inline-images/4_train-test-split.jpg)

## Train, test split

In [21]:
X = df_train_cleaned.iloc[:,:-1]
y = df_train_cleaned['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.)

## Fit

In [22]:
"""
regressor = SVR(kernel = 'rbf')
regressor.fit(X_train, y_train)

# Just for feedback
if platform == "darwin":
    os.system("say -v Monica ayam don treinin")
"""

'\nregressor = SVR(kernel = \'rbf\')\nregressor.fit(X_train, y_train)\n\n#\xa0Just for feedback\nif platform == "darwin":\n    os.system("say -v Monica ayam don treinin")\n'

### Fit Random Forest

In [23]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
regressor = RandomForestRegressor(n_estimators=20, random_state=0)
regressor.fit(X_train, y_train)


### Linear regression

In [32]:
regressor = LinearRegression()
regressor.fit(X_train, y_train)

### Naive Bayes

In [40]:
regressor=GaussianNB()
regressor.fit(X_train, y_train)

ValueError: Unknown label type: (array([5.814, 5.817, 5.82 , ..., 9.839, 9.841, 9.842]),)

# Applying same cleaning & processing to my `test.csv`

In [33]:
df_test_cleaned = df_test.select_dtypes(exclude='object')
print(df_test_cleaned.shape[0])
df_test_cleaned.sample()

13485


Unnamed: 0,id,carat,depth,table,x,y,z
10658,10658,1.16,61.5,54.0,6.78,6.75,4.16


# Predict on the `test.csv`

In [34]:
y_pred = regressor.predict(df_test_cleaned)
y_pred

# Just for feedback
if platform == "darwin":
    os.system("say -v Monica ayam don predictin")



# DF with two columns

In [35]:
print(df_test_cleaned.shape[0])
df_test_cleaned.sample()

13485


Unnamed: 0,id,carat,depth,table,x,y,z
6316,6316,0.31,59.5,59.0,4.47,4.4,2.64


In [36]:
df_test_cleaned['price'] = y_pred # Adding the predicted price
df_for_submission = df_test_cleaned[["id", "price"]] # Modifying for subnmission

In [37]:
print(df_for_submission.shape[0])
df_for_submission.sample()

13485


Unnamed: 0,id,price
1523,1523,12.393119


# Export (index=False)

In [38]:
df_for_submission.to_csv("my_submission2.csv", index=False)

# Just for feedback
if platform == "darwin":
    os.system("say -v Monica redi for submission")