# SOURCE CODE FOR DATA SCIENCE

Import the libraries for data science projects:

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# scikit-learn building blocks used throughout the notebook.
# `cross_val_score` is imported from `sklearn.model_selection`: the old
# `sklearn.cross_validation` module no longer exists in current releases.
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler, Normalizer, Binarizer
from sklearn.metrics import mean_absolute_error, adjusted_rand_score, homogeneity_score, v_measure_score
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn import neighbors
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

Name your project:

In [None]:
# Project banner; replace the [TITLE] placeholder with the project name.
PROJECT_BANNER = "\t[TITLE]\n"
print(PROJECT_BANNER)

## DATA ANALYSIS

Read the csv document that contains the dataset:

In [None]:
# Path of the CSV file holding the dataset (placeholder: put the file name
# in front of the ".csv" extension).
DATASET_PATH = ".csv"
df = pd.read_csv(DATASET_PATH)

Print a detailed summary of the dataset (column types, non-null counts, memory usage):

In [None]:
# DataFrame.info() writes its report straight to stdout and returns None,
# so wrapping it in print() would append a stray "None" to the output.
df.info()
print()  # blank separator line, as in the other cells

Get the names of the columns:

In [None]:
# Show the column labels of the dataset, followed by a separator.
column_labels = df.columns
print(column_labels, '\n')

Show if there are missing values in the dataset:

In [None]:
# Per-column count of missing entries; `missing` is reused by the next cell.
missing = df.isna().sum()
print('Missing Values ->\n')
print(missing, '\n')

Get the percentage representing the missing values:

In [None]:
# Share of all cells in the DataFrame that hold a missing value.
# `df.size` (rows * columns) replaces `np.product(df.shape)`, because
# `np.product` is no longer available in NumPy 2.x.
cells = df.size
# Recomputed from `df` so the cell gives the same answer when it is re-run,
# instead of depending on what `missing` currently holds.
missing = df.isnull().sum().sum()
# An empty DataFrame has no cells: report 0% rather than dividing by zero.
percent = (missing / cells) * 100 if cells else 0.0
print(f"The percentage of missing values is {percent}%.\n")

Show the first and last rows to make sure you got the right data:

In [None]:
# Sanity check of the raw data: shape, first/last 25 rows, summary statistics.
print(f'This dataset contains (rows, colums) before cleaning: {df.shape}.\n')
for preview in (df.head(25), df.tail(25), df.describe()):
    print(preview, '\n')

Fill missing values with 0:

In [None]:
# Replace every missing entry with 0 and keep working on the filled copy.
df = df.fillna(value=0)

Show the first and last rows after cleaning and apply a basic data analysis:

In [None]:
# Same sanity check as before, now on the cleaned DataFrame.
print(f'\nThis dataset contains (rows, colums) after cleaning: {df.shape}.\n')
for preview in (df.head(25), df.tail(25), df.describe()):
    print(preview, '\n')

## DATA VISUALIZATION

##### Make graphs from the previous dataframe:

Line plot:

In [None]:
# Line plot of the DataFrame; title and axis labels are placeholders to fill in.
line_axes = df.plot(title='')
line_axes.set_xlabel('')
line_axes.set_ylabel('')
plt.show()

Bar plot:

In [None]:
# Bar plot drawn with subplots enabled; title and labels are placeholders.
df.plot(kind='bar', title='', subplots=True)
plt.xlabel('')
plt.ylabel('')
plt.show()

Scatter plot:

In [None]:
# Scatter plot of one column against another. x=2 / y=3 are placeholders:
# replace them with the columns you want to compare.
# A scatter plot draws a single x/y pair, so no per-column subplots are
# requested, and no extra line plot is drawn afterwards: that way the axis
# labels below land on the scatter figure itself.
df.plot.scatter(x=2, y=3, title='')
plt.xlabel('')
plt.ylabel('')
plt.show()

## MACHINE LEARNING

Training and testing data:

In [None]:
# Feature matrix and target vector. The split below needs both to exist.
# NOTE(review): this assumes the target is the LAST column of `df` and every
# other column is a feature -- adjust the two selections to your dataset.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Hold out part of the data for evaluation; random_state fixes the shuffle
# so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

##### Create your model:

Supervised Learning Estimators:

1) Linear Regression

In [None]:
# Ordinary least-squares regressor. The `normalize` argument is not passed:
# it was removed from scikit-learn; scale the features explicitly with
# StandardScaler (see the standardization cell) when scaling is needed.
lr = LinearRegression()

2) Support Vector Machines (SVM)

In [None]:
# Support vector classifier with a linear kernel. Kernel names are
# lower-case; 'Linear' is rejected as an unknown kernel.
svc = SVC(kernel='linear')

3) Naive Bayes

In [None]:
# Gaussian naive Bayes classifier. The class must be instantiated: binding
# the bare class would leave `gnb` without an estimator instance to fit.
gnb = GaussianNB()

4) KNN

In [None]:
# k-nearest-neighbours classifier voting over the 5 closest samples.
# Uses the `neighbors` module imported at the top of the notebook.
knn = neighbors.KNeighborsClassifier(n_neighbors=5)

Unsupervised Learning Estimators:

1) Principal Component Analysis (PCA)

In [None]:
# PCA estimator configured with a fractional n_components.
# NOTE(review): scikit-learn documents a value between 0 and 1 as the share
# of variance to retain -- confirm against the PCA documentation.
variance_to_keep = 0.95
pca = PCA(n_components=variance_to_keep)

2) K Means

In [None]:
# K-means estimator with 3 clusters; random_state is fixed at 0.
kmeans_settings = {"n_clusters": 3, "random_state": 0}
k_means = KMeans(**kmeans_settings)

##### Fit the model to the data:

Supervised Learning:

In [None]:
# Fit every supervised estimator on the training split only. Fitting `lr`
# on the full X/y would let it see the X_test rows it is later asked to
# predict, making any evaluation on the test split meaningless.
lr.fit(X_train, y_train)
knn.fit(X_train, y_train)
svc.fit(X_train, y_train)

Unsupervised Learning:

In [None]:
# Fit the clustering model, then fit PCA and keep the transformed training
# data (note: `pca_model` holds the transformed data, not the estimator).
k_means.fit(X=X_train)
pca_model = pca.fit_transform(X=X_train)

##### Make predictions with the data:

Supervised Estimators:

In [None]:
# Predict on the held-out test split with each fitted estimator.
# The SVC is evaluated on X_test rather than on random numbers of a
# hard-coded width, which only works when the data has exactly 5 features
# and says nothing about the model.
svc_prediction = svc.predict(X_test)
lr_prediction = lr.predict(X_test)
knn_probabilities = knn.predict_proba(X_test)

# `y_prediction` ends up holding the KNN class probabilities, as it did when
# all three results were written to the same variable.
y_prediction = knn_probabilities

Unsupervised Estimators:

In [None]:
# Cluster assignment of each test sample; stored in `y_prediction`, which
# the metric cells below read.
cluster_labels = k_means.predict(X_test)
y_prediction = cluster_labels

Standardize the data:

In [None]:
# Learn the scaling on the training split, then apply that same scaling to
# both the training and the test split.
scaler = StandardScaler()
standardized_X = scaler.fit_transform(X_train)
standardized_X_test = scaler.transform(X_test)

Normalize the data:

In [None]:
# Fit the normalizer on the training split and transform both splits.
# Note that this rebinds `scaler`, replacing any scaler stored there before.
scaler = Normalizer()
normalized_X = scaler.fit_transform(X_train)
normalized_X_test = scaler.transform(X_test)

Binarize the data:

In [None]:
# Binarize the full feature matrix using a threshold of 0.0.
binarizer = Binarizer(threshold=0.0)
binary_X = binarizer.fit_transform(X)

Calculating the mean absolute error (mae):

In [None]:
# Ground truth for the metrics: the held-out targets from the train/test
# split. An empty list can never match the number of predictions, so every
# metric call would fail on it.
y_true = y_test
# MAE is a regression metric, so it is computed on the linear-regression
# predictions for the test split rather than on cluster labels.
mean_absolute_error(y_true, lr.predict(X_test))

Calculating the adjusted rand index:

In [None]:
# Adjusted Rand index between the reference labels and the predicted labels.
adjusted_rand_score(labels_true=y_true, labels_pred=y_prediction)

Calculating the homogeneity:

In [None]:
# Homogeneity of the predicted labels with respect to the reference labels.
homogeneity_score(labels_true=y_true, labels_pred=y_prediction)

Calculating the V-measure:

In [None]:
# V-measure of the predicted labels against the reference labels.
# `v_measure_score` is imported directly from sklearn.metrics at the top of
# the notebook; no `metric` name exists in this notebook.
v_measure_score(y_true, y_prediction)

Do the cross-validation of your model:

In [None]:
# Cross-validated scores: 4 folds for KNN on the training split, 2 folds for
# the linear regression on the full data.
knn_scores = cross_val_score(knn, X_train, y_train, cv=4)
print(knn_scores, '\n')
lr_scores = cross_val_score(lr, X, y, cv=2)
print(lr_scores)