Skip to content

Commit

Permalink
initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
bonn0062 committed Feb 17, 2019
1 parent 028dad5 commit 6063f5f
Show file tree
Hide file tree
Showing 2 changed files with 101 additions and 0 deletions.
90 changes: 90 additions & 0 deletions data_cleaning_and_preprocessing.ipynb
@@ -0,0 +1,90 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Basic data cleaning and preprocessing\n",
"\n",
"Here we'll use NumPy, pandas, and scikit-learn to do some necessary basic cleaning and preprocessing of our data so that we can use it in a machine learning model."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Import the libraries\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.impute import SimpleImputer\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.preprocessing import LabelEncoder, OneHotEncoder\n",
"\n",
"# Import the dataset\n",
"# Every column except the last is a feature; the last column is the target.\n",
"dataset = pd.read_csv('my_data.csv')\n",
"X = dataset.iloc[:, :-1].values\n",
"y = dataset.iloc[:, -1].values\n",
"\n",
"\n",
"# Take care of missing data\n",
"# Replace NaNs in columns 1 and 2 with the mean of each column.\n",
"# SimpleImputer supersedes preprocessing.Imputer, which was removed in scikit-learn 0.22.\n",
"imputer = SimpleImputer(missing_values=np.nan, strategy='mean')\n",
"X[:, 1:3] = imputer.fit_transform(X[:, 1:3])\n",
"\n",
"\n",
"# Encode categorical data\n",
"# Encode the independent variable\n",
"# One-hot encode column 0 and pass the remaining columns through unchanged;\n",
"# the encoded columns come first, followed by the untouched ones.\n",
"# ColumnTransformer replaces OneHotEncoder's `categorical_features` argument\n",
"# (removed in scikit-learn 0.22), and OneHotEncoder accepts string categories\n",
"# directly, so no LabelEncoder pass over X is needed.\n",
"# sparse_threshold=0 always yields a dense array, matching the earlier .toarray().\n",
"column_transformer = ColumnTransformer(\n",
"    transformers=[('onehot', OneHotEncoder(), [0])],\n",
"    remainder='passthrough',\n",
"    sparse_threshold=0\n",
")\n",
"X = np.asarray(column_transformer.fit_transform(X), dtype=float)\n",
"# Encode the dependent variable\n",
"labelencoder_y = LabelEncoder()\n",
"y = labelencoder_y.fit_transform(y)\n",
"\n",
"\n",
"# Splitting the dataset into the Training set and Test set\n",
"# random_state is fixed so the split is reproducible across runs.\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)\n",
"\n",
"# Feature Scaling (disabled; uncomment the lines below to standardize the features)\n",
"# The scaler is fitted on the training set only and then applied to the test set.\n",
"# y holds encoded class labels, so it is intentionally left unscaled.\n",
"# from sklearn.preprocessing import StandardScaler\n",
"# sc_X = StandardScaler()\n",
"# X_train = sc_X.fit_transform(X_train)\n",
"# X_test = sc_X.transform(X_test)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
11 changes: 11 additions & 0 deletions my_data.csv
@@ -0,0 +1,11 @@
Animal,Age,Worth,Friendly
Cat,4,72000,No
Dog,17,48000,Yes
Moose,6,54000,No
Dog,8,61000,No
Moose,4,,Yes
Cat,15,58000,Yes
Dog,,52000,No
Cat,12,79000,Yes
Moose,5,83000,No
Cat,7,67000,Yes

0 comments on commit 6063f5f

Please sign in to comment.