initial commit

bonn0062 · Feb 17, 2019 · 6063f5f · 6063f5f
1 parent 028dad5
commit 6063f5f
Show file tree

Hide file tree

Showing 2 changed files with 101 additions and 0 deletions.
diff --git a/data_cleaning_and_preprocessing.ipynb b/data_cleaning_and_preprocessing.ipynb
@@ -0,0 +1,90 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Basic data cleaning and preprocessing\n",
+    "\n",
+    "Here we'll use NumPy, pandas, and scikit-learn to do some necessary basic cleaning and preprocessing of our data so that we can use it in a machine learning model."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Import the libraries\n",
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt\n",
+    "import pandas as pd\n",
+    "from sklearn.compose import ColumnTransformer\n",
+    "from sklearn.impute import SimpleImputer\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.preprocessing import LabelEncoder, OneHotEncoder\n",
+    "\n",
+    "# Import the dataset\n",
+    "# Features are every column except the last; the target is the last column.\n",
+    "dataset = pd.read_csv('my_data.csv')\n",
+    "X = dataset.iloc[:, :-1].values\n",
+    "y = dataset.iloc[:, -1].values\n",
+    "\n",
+    "\n",
+    "# Take care of missing data\n",
+    "# Replace NaNs in the numeric columns (indices 1 and 2) with the column mean.\n",
+    "# SimpleImputer supersedes sklearn.preprocessing.Imputer, which was removed\n",
+    "# in scikit-learn 0.22. The cast hands the imputer a float array instead of\n",
+    "# the object-dtype slice produced by the mixed-type DataFrame.\n",
+    "imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')\n",
+    "X[:, 1:3] = imputer.fit_transform(X[:, 1:3].astype(float))\n",
+    "\n",
+    "\n",
+    "# Encode categorical data\n",
+    "# Encode the independent variable\n",
+    "# One-hot encode column 0 and pass the remaining columns through unchanged;\n",
+    "# the one-hot columns come first in the result. ColumnTransformer supersedes\n",
+    "# OneHotEncoder(categorical_features = ...), which was removed in scikit-learn 0.22.\n",
+    "# sparse_threshold = 0 forces a dense result, and the final cast turns the\n",
+    "# object array into a float array.\n",
+    "column_transformer = ColumnTransformer(\n",
+    "    [('onehot', OneHotEncoder(), [0])],\n",
+    "    remainder = 'passthrough',\n",
+    "    sparse_threshold = 0,\n",
+    ")\n",
+    "X = np.asarray(column_transformer.fit_transform(X), dtype = float)\n",
+    "# Encode the dependent variable\n",
+    "labelencoder_y = LabelEncoder()\n",
+    "y = labelencoder_y.fit_transform(y)\n",
+    "\n",
+    "\n",
+    "# Splitting the dataset into the Training set and Test set\n",
+    "# random_state is fixed so the split is reproducible.\n",
+    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)\n",
+    "\n",
+    "# Feature Scaling (disabled; uncomment to enable)\n",
+    "# StandardScaler expects 2-D input, hence the reshape on y_train.\n",
+    "# from sklearn.preprocessing import StandardScaler\n",
+    "# sc_X = StandardScaler()\n",
+    "# X_train = sc_X.fit_transform(X_train)\n",
+    "# X_test = sc_X.transform(X_test)\n",
+    "# sc_y = StandardScaler()\n",
+    "# y_train = sc_y.fit_transform(y_train.reshape(-1, 1)).ravel()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/my_data.csv b/my_data.csv
@@ -0,0 +1,11 @@
+Animal,Age,Worth,Friendly
+Cat,4,72000,No
+Dog,17,48000,Yes
+Moose,6,54000,No
+Dog,8,61000,No
+Moose,4,,Yes
+Cat,15,58000,Yes
+Dog,,52000,No
+Cat,12,79000,Yes
+Moose,5,83000,No
+Cat,7,67000,Yes