This notebook shows functions used in data science work for loading or creating the data for our projects.

# Data Load 

The following functions and processes are used when we work with real data that we either download or
already have stored on our local system.

## Example: the data is stored locally in CSV format

In [3]:
import pandas as pd

# Path of the file to load; replace it with the location of the CSV on your own computer.
datapath = 'C:\\Users\\stefano\\Dropbox\\Aprendizaje util\\Programacion learning\\modulo machine learning\\python\\winequality.csv'

# The file is read using ';' as the field separator; always check which
# separator your own file uses.
winedata = pd.read_csv(datapath, sep=';')

# Show the first rows to confirm the load worked.
winedata.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,color
0,5.2,0.34,0.0,1.8,0.05,27.0,63.0,0.9916,3.68,0.79,14.0,6,red
1,6.2,0.55,0.45,12.0,0.049,27.0,186.0,0.9974,3.17,0.5,9.3,6,white
2,7.15,0.17,0.24,9.6,0.119,56.0,178.0,0.99578,3.15,0.44,10.2,6,white
3,6.7,0.64,0.23,2.1,0.08,11.0,119.0,0.99538,3.36,0.7,10.9,5,red
4,7.6,0.23,0.34,1.6,0.043,24.0,129.0,0.99305,3.12,0.7,10.4,5,white


## Example: the data is in a .tgz (gzipped tar) file hosted somewhere else

### Downloading the .tgz file

In [4]:
import os
import tarfile
import requests

os.getcwd() # first check the current working directory, to decide where the file will be placed once downloaded

'C:\\Users\\stefano\\Dropbox\\Aprendizaje util\\Programacion learning\\modulo machine learning\\Python - scikit-learn datascience library'

In [5]:
# URL of the directory that hosts the file we want to download.
HOUSING_URL = "http://www.dcc.fc.up.pt/~ltorgo/Regression/"

# Name of the file we want to download and extract.
HOUSING_FILENAME = "cal_housing.tgz"

# Folder, on our computer or server, where the downloaded file will be placed.
HOUSING_PATH = 'C:\\Users\\stefano\\Dropbox\\Aprendizaje util\\Programacion learning\\modulo machine learning\\Python - scikit-learn datascience library'

In [6]:
def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH, housing_file=HOUSING_FILENAME):
    """
    Download a tar archive from a URL, save it in a local directory and extract it there.

    Arguments:
        - housing_url: Base URL where the archive is located. The file name is
          appended to it as-is, so it is expected to end with "/".
        - housing_path: Local directory where the archive is saved and extracted.
          It is created if it does not exist.
        - housing_file: Name of the archive file.

    Returns nothing. If the server does not answer with status 200, a message is
    printed and nothing is written or extracted.
    """
    # Make sure the destination directory exists before writing into it.
    if not os.path.isdir(housing_path):
        print(housing_path,"does not exist, it will be created...")
        os.makedirs(housing_path)
        print(housing_path,"created!")

    # Full local path of the archive: destination directory + file name.
    tgz_path = os.path.join(housing_path, housing_file)

    # The request is sent with an explicit User-Agent header.
    header = {'User-Agent': 'Mozilla/5.0'}
    print("requesting housing dataset at",housing_url)

    # stream=True lets the body be written to disk chunk by chunk instead of
    # being held in memory first; the timeout keeps a dead server from
    # blocking the call forever.
    with requests.get(housing_url + housing_file, headers=header, stream=True, timeout=60) as response:
        if response.status_code != 200:
            # Stop here: without a fresh download there is nothing valid to
            # extract, and a leftover file from an earlier run must not be
            # mistaken for the requested one.
            print(" request failed ")
            return

        with open(tgz_path, 'wb') as handle:
            for block in response.iter_content(1024):
                handle.write(block)
        print("download complete!")

    # Finally, extract the archive next to where it was saved. The context
    # manager closes the archive even if the extraction fails.
    # NOTE(review): extractall writes whatever member paths the archive
    # contains; only use this with archives from a trusted source.
    print("Untarring files...")
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)
    print("Extraction complete!")

In [7]:
fetch_housing_data() # download and extract the archive; with the data on our local machine we can create the dataframe like we did before

requesting housing dataset at http://www.dcc.fc.up.pt/~ltorgo/Regression/
download complete!
Untarring files...
Extraction complete!


### With the data on our local computer, we now create a single local data file that joins the different files that came in the .tgz archive

In [10]:
# First we look at the file that lists the feature names.
# NOTE: HOUSING_UNTAR_PATH is assigned in cell In [8], which appears further
# down in this export; that cell has to be run before this one.
with open(HOUSING_UNTAR_PATH + "/cal_housing.domain") as header_file:
    domain_text = header_file.read()
print(domain_text)

longitude: continuous.
latitude: continuous.
housingMedianAge: continuous. 
totalRooms: continuous. 
totalBedrooms: continuous. 
population: continuous. 
households: continuous. 
medianIncome: continuous. 
medianHouseValue: continuous. 



In [12]:
import numpy as np
from itertools import islice

n_lines = 5
# Preview the first n_lines rows of the raw data file. islice simply stops if
# the file has fewer lines, whereas calling next() on an exhausted file would
# raise StopIteration.
# NOTE: HOUSING_UNTAR_PATH is assigned in cell In [8], which appears further
# down in this export; that cell has to be run before this one.
with open(HOUSING_UNTAR_PATH + "/cal_housing.data") as data_file:
    for line in islice(data_file, n_lines):
        # Each line keeps its own trailing newline, so print() leaves a blank
        # line between rows.
        print(line)

-122.230000,37.880000,41.000000,880.000000,129.000000,322.000000,126.000000,8.325200,452600.000000

-122.220000,37.860000,21.000000,7099.000000,1106.000000,2401.000000,1138.000000,8.301400,358500.000000

-122.240000,37.850000,52.000000,1467.000000,190.000000,496.000000,177.000000,7.257400,352100.000000

-122.250000,37.850000,52.000000,1274.000000,235.000000,558.000000,219.000000,5.643100,341300.000000

-122.250000,37.850000,52.000000,1627.000000,280.000000,565.000000,259.000000,3.846200,342200.000000



In [8]:
# Directory, inside our local computer, where the extracted data is placed.
# os.path.join is used (as fetch_housing_data does) so the path gets the
# separator of the operating system instead of a hard-coded "/".
HOUSING_UNTAR_PATH = os.path.join(HOUSING_PATH, "CaliforniaHousing")
# In this case the data is split into a ".domain" file and the file with the
# actual data; we will combine them in one CSV. This is the name of that
# destination file.
HOUSING_TOTAL_FILENAME = "housing.csv"
# Complete route of the destination file, placed directly inside HOUSING_PATH.
HOUSING_TOTAL_PATH = os.path.join(HOUSING_PATH, HOUSING_TOTAL_FILENAME)

HOUSING_TOTAL_PATH

'C:\\Users\\stefano\\Dropbox\\Aprendizaje util\\Programacion learning\\modulo machine learning\\Python - scikit-learn datascience library/housing.csv'

In [13]:
def merge_housing_data(housing_untar_path=HOUSING_UNTAR_PATH, housing_total_path=HOUSING_TOTAL_PATH):
    """
    Merge the files extracted from the archive into a single CSV file.

    The ".domain" file (one "<feature name>: continuous." line per feature) is
    flattened into a single comma-separated header line, and the content of
    every other file found in the directory is copied after it unchanged.

    Arguments:
        - housing_untar_path: The directory where we have the files, after extracting the archive.
        - housing_total_path: The route of the destination file, where we will place the data.
    """
    import glob

    # Never read the destination file as an input, in case it lives inside
    # the directory being scanned (e.g. left there by a previous run).
    destination = os.path.abspath(housing_total_path)
    candidates = [
        path for path in glob.glob(housing_untar_path + "/*")
        if os.path.abspath(path) != destination
    ]

    def _is_header_file(path):
        # Only the file name is inspected, so a ".domain" somewhere in the
        # directory part of the path cannot turn every file into a header.
        return ".domain" in os.path.basename(path)

    # The header file must come first so the column names end up on the first
    # line of the output. Within each group the files keep the original
    # criterion: ordered by size, from smallest to largest.
    concat_files = sorted(
        candidates,
        key=lambda path: (not _is_header_file(path), os.path.getsize(path)),
    )
    print("Merging files...",concat_files)
    with open(housing_total_path,"w") as outfile:
        for part_file in concat_files:
            with open(part_file, "r") as infile:
                if _is_header_file(part_file):
                    # Flatten the field-name file into one line:
                    # field1,field2,field3,...
                    # The type suffix (": continuous.") is removed from each
                    # line and blank lines are skipped so they do not create
                    # empty column names.
                    header = [
                        field.replace(": continuous.","").strip()
                        for field in infile
                        if field.strip()
                    ]
                    # Join the names with "," and write them to the output file.
                    outfile.write(",".join(header)+"\n")
                else:
                    outfile.write(infile.read())
    print("Files merged into",outfile.name)

In [14]:
# Build the combined CSV using the default source directory and destination path.
merge_housing_data()



Merging files... ['C:\\Users\\stefano\\Dropbox\\Aprendizaje util\\Programacion learning\\modulo machine learning\\Python - scikit-learn datascience library/CaliforniaHousing\\cal_housing.domain', 'C:\\Users\\stefano\\Dropbox\\Aprendizaje util\\Programacion learning\\modulo machine learning\\Python - scikit-learn datascience library/CaliforniaHousing\\cal_housing.data']
Files merged into C:\Users\stefano\Dropbox\Aprendizaje util\Programacion learning\modulo machine learning\Python - scikit-learn datascience library/housing.csv


In [16]:
import pandas as pd

# Location of the merged housing CSV on the local computer.
datapath = 'C:\\Users\\stefano\\Dropbox\\Aprendizaje util\\Programacion learning\\modulo machine learning\\Python - scikit-learn datascience library\\housing.csv'

# The file is read using ',' as the field separator.
housingdata = pd.read_csv(datapath, sep=',')

# Show the first rows to check the column names and the values.
housingdata.head()

Unnamed: 0,longitude,latitude,housingMedianAge,totalRooms,totalBedrooms,population,households,medianIncome,medianHouseValue
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0


# Data creation 

We can create Gaussian-distributed samples using `make_blobs` from scikit-learn.

In [17]:
# Details here: https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_blobs.html
from sklearn.datasets import make_blobs

my_dataset = make_blobs(
    n_samples=10,      # number of samples (rows) to generate
    n_features=2,      # number of features (columns) of each sample
    centers=3,         # number of Gaussian centers (clusters) to generate
    cluster_std=1.5,   # standard deviation of the clusters
    random_state=2,    # seed, so the same data is generated on every run
)

my_dataset

(array([[  0.96455381,   0.46894968],
        [ -1.21779287, -11.15836353],
        [  1.80183704,  -2.1877917 ],
        [ -2.10087691,  -3.74757963],
        [ -0.45292089,  -6.04316334],
        [ -0.52577983, -11.34940749],
        [ -0.12855687,  -1.28001427],
        [ -1.20778828,  -4.87647215],
        [ -2.9098058 ,  -3.62795484],
        [ -2.86703029, -10.84498679]]), array([1, 0, 1, 2, 0, 0, 1, 2, 2, 0]))

In [19]:
# make_blobs returns two objects: the data itself, which becomes the dataframe
# (columns "x" and "y"), and the labels telling which center each row belongs
# to, which are attached here as the "label" column (they can also be ignored).
dataset_blobs = pd.DataFrame(my_dataset[0], columns=["x", "y"]).assign(label=my_dataset[1])

dataset_blobs

Unnamed: 0,x,y,label
0,0.964554,0.46895,1
1,-1.217793,-11.158364,0
2,1.801837,-2.187792,1
3,-2.100877,-3.74758,2
4,-0.452921,-6.043163,0
5,-0.52578,-11.349407,0
6,-0.128557,-1.280014,1
7,-1.207788,-4.876472,2
8,-2.909806,-3.627955,2
9,-2.86703,-10.844987,0


# Using scikit learn generated data

## Creating data for a regression exercise

In [26]:
# Load the Boston housing dataset bundled with scikit-learn.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this import is expected to fail on newer versions — confirm the
# installed version before running this cell.
from sklearn.datasets import load_boston

datos_boston = load_boston()

# Display the returned object to see which entries it contains.
datos_boston

{'data': array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,
         4.9800e+00],
        [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,
         9.1400e+00],
        [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,
         4.0300e+00],
        ...,
        [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
         5.6400e+00],
        [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,
         6.4800e+00],
        [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
         7.8800e+00]]),
 'target': array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,
        18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,
        15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,
        13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,
        21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,
        35.4, 24.7, 3

In [24]:

# datos_boston holds several objects. The feature names become the column
# names, the data matrix provides the rows, and the target is added as one more
# column, giving a complete dataframe to work with.
dataset_boston = pd.DataFrame(datos_boston["data"], columns=datos_boston["feature_names"])
dataset_boston["target"] = datos_boston["target"]

In [27]:
# Display the first rows of the completed dataframe.
dataset_boston.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


## Creating data for a classification exercise

In [28]:
from sklearn.datasets import make_classification

dataset = make_classification(
    n_samples=600,      # number of samples (rows)
    n_features=2,       # total number of features
    n_redundant=0,      # number of redundant features
    n_classes=2,        # number of classes of the target variable
    random_state=749,   # seed for the random generator
)

In [29]:
import pandas as pd

# Build the dataframe from the features (first element of the result) and
# attach the labels (second element) as the "label" column.
dataset_clasificacion = pd.DataFrame(dataset[0]).assign(label=dataset[1])

# Look at the first five rows.
dataset_clasificacion.iloc[:5]

Unnamed: 0,0,1,label
0,0.574208,1.609926,1
1,-0.666932,-0.698694,0
2,0.452082,-2.546917,1
3,-0.854633,-0.364912,0
4,-0.277625,-0.41126,0
