In [3]:
"""
    1. Take a peek at your raw data.
    2. Review the dimensions of your dataset.
    3. Review the data types of attributes in your data.
    4. Summarize the distribution of instances across classes in your dataset.
    5. Summarize your data using descriptive statistics.
    6. Understand the relationships in your data using correlations.
    7. Review the skew of the distributions of each attribute.
"""

import pandas as pd

In [4]:
"""
Importare un database.
Per importare un database si può utilizzare 'pandas.read_csv'.
Per informazioni consultare: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html
"""

data = pd.read_csv('datasets/winequality-red.csv', sep = ';')
data

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
5,7.4,0.660,0.00,1.8,0.075,13.0,40.0,0.99780,3.51,0.56,9.4,5
6,7.9,0.600,0.06,1.6,0.069,15.0,59.0,0.99640,3.30,0.46,9.4,5
7,7.3,0.650,0.00,1.2,0.065,15.0,21.0,0.99460,3.39,0.47,10.0,7
8,7.8,0.580,0.02,2.0,0.073,9.0,18.0,0.99680,3.36,0.57,9.5,7
9,7.5,0.500,0.36,6.1,0.071,17.0,102.0,0.99780,3.35,0.80,10.5,5


In [5]:
"""
DataFrame.head(n)

    Return the first n rows.

Parameters:
    n : int, default 5
        Number of rows to select.
Returns:
    obj_head : type of caller
    The first n rows of the caller object.
"""
first_ten_rows = data.head(n=10)

"""
DataFrame.tail(n)

    Return the last n rows.

Parameters:
    n : int, default 5
        Number of rows to select.
Returns:
    obj_tail : type of caller
    The last n rows of the caller object.
"""
last_ten_rows = data.tail(n=10)

In [6]:
"""
DataFrame.shape
    
    Return a tuple representing the dimensionality of the DataFrame.
"""
data.shape
"""

DataFrame.dtypes

    Return the dtypes in this object.
"""
data.dtypes

fixed acidity           float64
volatile acidity        float64
citric acid             float64
residual sugar          float64
chlorides               float64
free sulfur dioxide     float64
total sulfur dioxide    float64
density                 float64
pH                      float64
sulphates               float64
alcohol                 float64
quality                   int64
dtype: object

In [7]:
"""
DataFrame.describe(percentiles=None, include=None, exclude=None)
    Generates descriptive statistics that summarize the central tendency, dispersion and shape 
    of a dataset’s distribution, excluding NaN values.

    Analyzes both numeric and object series, as well as DataFrame column sets of mixed data types.
    The output will vary depending on what is provided. Refer to the notes below for more detail.

Parameters:
    percentiles : list-like of numbers, optional
        The percentiles to include in the output. All should fall between 0 and 1. 
        The default is [.25, .5, .75], which returns the 25th, 50th, and 75th percentiles.
        include : ‘all’, list-like of dtypes or None (default), optional
        A white list of data types to include in the result. Ignored for Series. 
        Here are the options:
            ‘all’ : All columns of the input will be included in the output.
            A list-like of dtypes : Limits the results to the provided data types. 
                To limit the result to numeric types submit numpy.number. 
                To limit it instead to object columns submit the numpy.object data type. 
                Strings can also be used in the style of select_dtypes (e.g. df.describe(include=['O'])). 
                To select pandas categorical columns, use 'category'
            None (default) : The result will include all numeric columns.
    exclude : list-like of dtypes or None (default), optional,
            A black list of data types to omit from the result. Ignored for Series. Here are the options:
                A list-like of dtypes : Excludes the provided data types from the result. 
                    To exclude numeric types submit numpy.number. To exclude object columns 
                    submit the data type numpy.object. Strings can also be used in the style of select_dtypes (e.g. df.describe(include=['O'])). To exclude pandas categorical columns, use 'category'
            None (default) : The result will exclude nothing.
            
Returns:
    summary: Series/DataFrame of summary statistics
"""
by_pH = data.groupby('pH')
describe_pH = by_pH['alcohol'].describe()
print("Describe pH: {}".format(describe_pH.head(3)))

Describe pH:       count  mean  std   min   25%   50%   75%   max
pH                                                  
2.74    1.0   9.4  NaN   9.4   9.4   9.4   9.4   9.4
2.86    1.0   8.4  NaN   8.4   8.4   8.4   8.4   8.4
2.87    1.0  10.2  NaN  10.2  10.2  10.2  10.2  10.2


In [8]:
"""
DataFrame.groupby(by=None, axis=0, level=None, as_index=True, sort=True, group_keys=True, 
                squeeze=False, **kwargs)
Group series using mapper (dict or key function, apply given function to group, return result as series)
or by a series of columns.

Parameters:
    by : mapping, function, str, or iterable
        Used to determine the groups for the groupby. If by is a function, it’s called on each
        value of the object’s index. If a dict or Series is passed, the Series or dict VALUES
        will be used to determine the groups (the Series’ values are first aligned; see .align() method).
        If an ndarray is passed, the values are used as-is determine the groups. A str or list 
        of strs may be passed to group by the columns in self
    axis : int, default 0
    level : int, level name, or sequence of such, default None
        If the axis is a MultiIndex (hierarchical), group by a particular level or levels
    as_index : boolean, default True
        For aggregated output, return object with group labels as the index. Only relevant 
        for DataFrame input. as_index=False is effectively “SQL-style” grouped output
    sort : boolean, default True
        Sort group keys. Get better performance by turning this off. Note this does not influence 
        the order of observations within each group. groupby preserves the order of rows within each group.
    group_keys : boolean, default True
        When calling apply, add group keys to index to identify pieces
    squeeze : boolean, default False
        reduce the dimensionality of the return type if possible, otherwise return a consistent type

Returns:
    GroupBy object
"""

fixed_acidity_mean = data.groupby(by='fixed acidity')['residual sugar'].mean()
print("Media Fixed Acidity: {}\n".format(fixed_acidity_mean.head(10)))
alcohol_quality = data.groupby(by=['alcohol', 'quality'], squeeze=True).size()
print("Relazione tra gradiazione alcolica e qualità{}\n", alcohol_quality)

Media Fixed Acidity: fixed acidity
4.6    2.100000
4.7    2.300000
4.9    2.100000
5.0    2.016667
5.1    1.725000
5.2    1.908333
5.3    1.900000
5.4    1.600000
5.5    1.800000
5.6    2.800000
Name: residual sugar, dtype: float64

Relazione tra gradiazione alcolica e qualità{}
 alcohol    quality
8.400000   3           1
           6           1
8.500000   5           1
8.700000   6           2
8.800000   5           2
9.000000   3           1
           4           2
           5          11
           6          16
9.050000   4           1
9.100000   4           2
           5          14
           6           7
9.200000   4           3
           5          50
           6          17
           7           2
9.233333   6           1
9.250000   6           1
9.300000   4           2
           5          44
           6          13
9.400000   4           2
           5          79
           6          22
9.500000   5          97
           6          40
           7           2


In [9]:
"""
DataFrame.corr(method='pearson', min_periods=1)
    Compute pairwise correlation of columns, excluding NA/null values

Parameters:
    - method : {‘pearson’, ‘kendall’, ‘spearman’}
        pearson : standard correlation coefficient
        kendall : Kendall Tau correlation coefficient
        spearman : Spearman rank correlation
    - min_periods : int, optional
        Minimum number of observations required per pair of columns to have a valid result. 
        Currently only available for pearson and spearman correlation
Returns:
    y : DataFrame
"""
pd.set_option('display.width',100)
pd.set_option('precision', 3)
correlations = data.corr(method='pearson')
print("Correlations: \n{}".format(correlations))

Correlations: 
                      fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
fixed acidity                 1.000            -0.256        0.672           0.115      0.094   
volatile acidity             -0.256             1.000       -0.552           0.002      0.061   
citric acid                   0.672            -0.552        1.000           0.144      0.204   
residual sugar                0.115             0.002        0.144           1.000      0.056   
chlorides                     0.094             0.061        0.204           0.056      1.000   
free sulfur dioxide          -0.154            -0.011       -0.061           0.187      0.006   
total sulfur dioxide         -0.113             0.076        0.036           0.203      0.047   
density                       0.668             0.022        0.365           0.355      0.201   
pH                           -0.683             0.235       -0.542          -0.086     -0.265   
sulphates      

In [14]:
"""
DataFrame.skew(axis=None, skipna=None, level=None, numeric_only=None, **kwargs)0
    Return unbiased skew over requested axis Normalized by N-1

Parameters:
    - axis : {index (0), columns (1)}
    - skipna : boolean, default True
        Exclude NA/null values. If an entire row/column is NA or empty, the result will be NA
    - level : int or level name, default None
        If the axis is a MultiIndex (hierarchical), count along a particular level, collapsing into a Series
    - numeric_only : boolean, default None
        Include only float, int, boolean columns. If None, will attempt to use everything, 
        then use only numeric data. Not implemented for Series.

Returns:
    - skew : Series or DataFrame (if level specified)
"""
skew = data.skew(axis=0)
skew

fixed acidity           0.983
volatile acidity        0.672
citric acid             0.318
residual sugar          4.541
chlorides               5.680
free sulfur dioxide     1.251
total sulfur dioxide    1.516
density                 0.071
pH                      0.194
sulphates               2.429
alcohol                 0.861
quality                 0.218
dtype: float64