# %% [markdown]
# # Basic data cleaning and preprocessing
#
# Here we'll use NumPy, pandas, and scikit-learn to do some basic cleaning and
# preprocessing of our data so that we can use it in a machine learning model.
#
# The input file `my_data.csv` has one categorical feature (`Animal`), two
# numeric features that each contain a missing value (`Age`, `Worth`), and a
# Yes/No target in the last column (`Friendly`):
#
# ```
# Animal,Age,Worth,Friendly
# Cat,4,72000,No
# Dog,17,48000,Yes
# Moose,6,54000,No
# Dog,8,61000,No
# Moose,4,,Yes
# Cat,15,58000,Yes
# Dog,,52000,No
# Cat,12,79000,Yes
# Moose,5,83000,No
# Cat,7,67000,Yes
# ```

# %%
# Import the libraries (every import lives in this one cell so that
# "Restart Kernel -> Run All" exposes a missing dependency immediately).
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

# %%
# Configuration: everything a reader might want to tune.
DATA_PATH = 'my_data.csv'
CATEGORICAL_COLS = [0]        # positional index of the categorical feature in X
NUMERIC_COLS = slice(1, 3)    # positional range of the numeric features in X
TEST_SIZE = 0.2
RANDOM_STATE = 0
SCALE_FEATURES = False        # scaling was disabled in the original notebook

# %% [markdown]
# ## Load the data

# %%
dataset = pd.read_csv(DATA_PATH)
# Features are every column but the last; the target is the last column.
# Indexing the target with -1 (rather than a hard-coded 3) keeps X and y
# consistent if feature columns are added to or removed from the CSV.
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

# %% [markdown]
# ## Take care of missing data

# %%
# `sklearn.preprocessing.Imputer` was removed in scikit-learn 0.22;
# `SimpleImputer` is its replacement and always imputes column-wise, which is
# what the old `axis=0` argument selected.
# X is an object array here (strings mixed with numbers), so the numeric slice
# is cast to float explicitly before the column means are computed.
# NOTE(review): the imputer (and the encoder below) are fit on the full dataset
# before the train/test split, as in the original notebook; for a real model,
# fit them on the training rows only to avoid leaking test-set statistics.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X[:, NUMERIC_COLS] = imputer.fit_transform(X[:, NUMERIC_COLS].astype(float))

# %% [markdown]
# ## Encode categorical data

# %%
# Encode the independent variable.
# `OneHotEncoder(categorical_features=...)` was removed in scikit-learn 0.22.
# `ColumnTransformer` now selects the columns to encode, and `OneHotEncoder`
# accepts string categories directly, so the former `LabelEncoder` pre-pass on
# X[:, 0] is no longer needed. The output layout matches the original: the
# one-hot columns (categories in sorted order) first, then the untouched
# numeric columns.
onehotencoder = OneHotEncoder()
column_transformer = ColumnTransformer(
    transformers=[('categorical', onehotencoder, CATEGORICAL_COLS)],
    remainder='passthrough',
    sparse_threshold=0,  # always return a dense array
)
# The passthrough columns come from an object array, so cast the stacked
# result to float to get a purely numeric feature matrix.
X = column_transformer.fit_transform(X).astype(float)

# Encode the dependent variable (labels are assigned in sorted order).
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)

# Sanity checks: no missing values survive, and X and y stay row-aligned.
assert not np.isnan(X).any(), "unexpected missing values after imputation"
assert X.shape[0] == y.shape[0], "features and target have different lengths"

# %% [markdown]
# ## Split the dataset into the training set and test set

# %%
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

assert len(X_train) + len(X_test) == len(X)

# %% [markdown]
# ## Feature scaling (optional)
#
# Scaling stays off by default, matching the original notebook where this step
# was disabled. Set `SCALE_FEATURES = True` in the configuration cell to turn
# it on.

# %%
if SCALE_FEATURES:
    # Fit on the training set only, then apply the same transform to the test
    # set so that no test-set statistics leak into the scaler.
    sc_X = StandardScaler()
    X_train = sc_X.fit_transform(X_train)
    X_test = sc_X.transform(X_test)
    # The target is deliberately NOT scaled: y holds label-encoded classes, and
    # StandardScaler rejects the 1-D y_train array the original tried to pass.