In [2]:
import numpy as np
import pandas as pd

## Creating a pandas series

- pd.Series(`data`, `index`)

- A pandas series can be created in two ways

1) By Passing the necessary parameters into the `pd.Series()`

2) By Passing a dictionary of key - value pairs to the `pd.Series()`

In [3]:
# Five country names, labelled 1..5 (one index label per value)
values = ["USA", "China", "UK", "Japan", "India"]
labels = list(range(1, len(values) + 1))

# Build the Series from the two parallel lists:
# `values` supplies the data and `labels` supplies the index
series = pd.Series(values, index=labels)
print(series)

1      USA
2    China
3       UK
4    Japan
5    India
dtype: object


In [4]:
# Look up each value through its index label and print the label/value pair
for label in labels:
    print(label, series.loc[label])

1 USA
2 China
3 UK
4 Japan
5 India


In [5]:
# Build a Series straight from a dict: the keys become the index labels
# and the dict values become the data
dic = {
    1: "Germany",
    2: "France",
    3: "Korea",
    4: "Canada",
    5: "Australia",
}
series2 = pd.Series(data=dic)
print(series2)

1      Germany
2       France
3        Korea
4       Canada
5    Australia
dtype: object


In [6]:
# Print every index label next to the value series2 holds for it
for label in labels:
    print(label, series2.loc[label])

1 Germany
2 France
3 Korea
4 Canada
5 Australia


In [7]:
# Arithmetic on pandas Series: `+` pairs elements up by index label;
# because both Series hold strings, each pair is concatenated
combined = series + series2
print(combined)

1        USAGermany
2       ChinaFrance
3           UKKorea
4       JapanCanada
5    IndiaAustralia
dtype: object


## Working with Pandas Dataframes

- A 2D data structure that carries both row-wise and column-wise information

### Creating a Dataframe
- pd.DataFrame(`data`, `index`, `columns`)

In [8]:
# Fixed seed so the random frame (and every output derived from it below)
# is identical on each Restart & Run All
SEED = 42
rng = np.random.default_rng(SEED)

# Data to be stored in the dataframe: a 5x5 array of standard-normal samples
data = rng.standard_normal((5, 5))

# Creating the dataframe with integer row labels 0..4 and column labels A..E
df = pd.DataFrame(data, index=list(range(5)), columns=["A", "B", "C", "D", "E"])
print(df)

          A         B         C         D         E
0 -1.015657  0.255399  1.450900  0.907479 -0.912368
1  0.019332 -0.552564  1.880839 -1.425081 -0.562465
2 -0.283758 -1.185107 -0.922311 -0.786755  0.085414
3  1.273420 -0.372467 -0.948524  0.685333 -1.419774
4 -0.626735  0.539313  0.430386 -1.799865  1.197431


### Accessing a column from a dataframe
- df[`column`]

In [9]:
# Indexing with a single column label returns that column as a Series
column_a = df["A"]
print(column_a)

0   -1.015657
1    0.019332
2   -0.283758
3    1.273420
4   -0.626735
Name: A, dtype: float64


### Accessing a subset of the dataframe
- df[[`Columns`]]

In [10]:
# Indexing with a list of column labels returns a DataFrame
# containing only those columns, in the order listed
wanted_columns = ["A", "C", "E"]
print(df[wanted_columns])

          A         C         E
0 -1.015657  1.450900 -0.912368
1  0.019332  1.880839 -0.562465
2 -0.283758 -0.922311  0.085414
3  1.273420 -0.948524 -1.419774
4 -0.626735  0.430386  1.197431


### Deleting a column
- df.drop(`column`, `axis`, `inplace`)

In [11]:
# drop() without inplace=True hands back a new frame and leaves `df` alone;
# the returned frame is not used for anything here, so both printouts of `df` match
print("Before Dropping E")
print(df)

dropped_copy = df.drop("E", axis=1)

print("\n\nAfter Dropping E - No Change as Inplace = False")
print(df)

Before Dropping E
          A         B         C         D         E
0 -1.015657  0.255399  1.450900  0.907479 -0.912368
1  0.019332 -0.552564  1.880839 -1.425081 -0.562465
2 -0.283758 -1.185107 -0.922311 -0.786755  0.085414
3  1.273420 -0.372467 -0.948524  0.685333 -1.419774
4 -0.626735  0.539313  0.430386 -1.799865  1.197431


After Dropping E - No Change as Inplace = False
          A         B         C         D         E
0 -1.015657  0.255399  1.450900  0.907479 -0.912368
1  0.019332 -0.552564  1.880839 -1.425081 -0.562465
2 -0.283758 -1.185107 -0.922311 -0.786755  0.085414
3  1.273420 -0.372467 -0.948524  0.685333 -1.419774
4 -0.626735  0.539313  0.430386 -1.799865  1.197431


In [12]:
# Dropping a column from the dataframe, this time modifying `df` itself
print("Before Dropping E")
print(df)

# inplace=True mutates `df` instead of returning a new frame.
# errors="ignore" keeps the cell re-runnable: once "E" is gone, running the
# cell again is a no-op rather than a KeyError.
df.drop("E", axis=1, inplace=True, errors="ignore")

print("\n\nAfter Dropping E - Permanent Change as Inplace = True")
print(df)

Before Dropping E
          A         B         C         D         E
0 -1.015657  0.255399  1.450900  0.907479 -0.912368
1  0.019332 -0.552564  1.880839 -1.425081 -0.562465
2 -0.283758 -1.185107 -0.922311 -0.786755  0.085414
3  1.273420 -0.372467 -0.948524  0.685333 -1.419774
4 -0.626735  0.539313  0.430386 -1.799865  1.197431


After Dropping E - Permanent Change as Inplace = True
          A         B         C         D
0 -1.015657  0.255399  1.450900  0.907479
1  0.019332 -0.552564  1.880839 -1.425081
2 -0.283758 -1.185107 -0.922311 -0.786755
3  1.273420 -0.372467 -0.948524  0.685333
4 -0.626735  0.539313  0.430386 -1.799865


### Selecting rows of a dataframe

- df.loc[`row`]: Using the row labels

- df.iloc[`row position`]: Using the zero-based integer position of the row

In [17]:
# .loc picks a row by its index label, .iloc by its zero-based position;
# each lookup returns the row as a Series
second_row = df.loc[1]
third_row = df.iloc[2]

print("Printing the second row:")
print(second_row, "\n\n")

print("Printing the third row:")
print(third_row)

Printing the second row:
A    0.019332
B   -0.552564
C    1.880839
D   -1.425081
Name: 1, dtype: float64 


Printing the third row:
A   -0.283758
B   -1.185107
C   -0.922311
D   -0.786755
Name: 2, dtype: float64


### Accessing a specific value from the data frame

- df.loc[`row`, `column`]

In [19]:
# Show the whole frame first, then pull out a single value
# addressed by (row label, column label)
print(df, "\n\n")

cell_value = df.loc[2, "C"]
print("The 3rd Row and 3rd Column of the DataFrame: ", cell_value)

          A         B         C         D
0 -1.015657  0.255399  1.450900  0.907479
1  0.019332 -0.552564  1.880839 -1.425081
2 -0.283758 -1.185107 -0.922311 -0.786755
3  1.273420 -0.372467 -0.948524  0.685333
4 -0.626735  0.539313  0.430386 -1.799865 


The 3rd Row and 3rd Column of the DataFrame:  -0.9223110156240545


### Accessing a subset of the dataframe

- df.loc[[`rows`], [`columns`]]

In [20]:
# Row labels 1 and 2 crossed with column labels A and D
row_labels = [1, 2]
column_labels = ["A", "D"]
print(df.loc[row_labels, column_labels])

          A         D
1  0.019332 -1.425081
2 -0.283758 -0.786755


### Conditional Selection of the Dataframe

In [21]:
# Print the current DataFrame in full, as a reference for the filtering cells that follow
print(df)

          A         B         C         D
0 -1.015657  0.255399  1.450900  0.907479
1  0.019332 -0.552564  1.880839 -1.425081
2 -0.283758 -1.185107 -0.922311 -0.786755
3  1.273420 -0.372467 -0.948524  0.685333
4 -0.626735  0.539313  0.430386 -1.799865


In [22]:
# Comparing a column with a scalar yields a boolean Series (True where C > 0);
# what gets printed is that mask, not the matching values themselves
print("Displaying only values greater than 0 in Column - C")
c_is_positive = df["C"] > 0
print(c_is_positive)

Displaying only values greater than 0 in Column - C
0     True
1     True
2    False
3    False
4     True
Name: C, dtype: bool


In [23]:
# Indexing the frame with a boolean Series keeps only the rows where the mask is True
print("Using the conditional selection on C to display rows in the entire DataFrame")
c_is_positive = df["C"] > 0
print(df[c_is_positive])

Using the conditional selection on C to display rows in the entire DataFrame
          A         B         C         D
0 -1.015657  0.255399  1.450900  0.907479
1  0.019332 -0.552564  1.880839 -1.425081
4 -0.626735  0.539313  0.430386 -1.799865


### Multiple conditions in Pandas

In [24]:
# Print the current DataFrame in full, as a reference for the multi-condition filter that follows
print(df)

          A         B         C         D
0 -1.015657  0.255399  1.450900  0.907479
1  0.019332 -0.552564  1.880839 -1.425081
2 -0.283758 -1.185107 -0.922311 -0.786755
3  1.273420 -0.372467 -0.948524  0.685333
4 -0.626735  0.539313  0.430386 -1.799865


In [25]:
# Build one mask per column, then combine them element-wise with `&`
# so that only rows satisfying both conditions are kept
print("Displaying values of the dataframe where B and C are > 0")
b_is_positive = df["B"] > 0
c_is_positive = df["C"] > 0
print(df[b_is_positive & c_is_positive])

Displaying values of the dataframe where B and C are > 0
          A         B         C         D
0 -1.015657  0.255399  1.450900  0.907479
4 -0.626735  0.539313  0.430386 -1.799865


### Resetting the Index Values of a DataFrame

In [27]:
# reset_index() returns a new frame in which the old index has been moved
# into a regular column named "index"; `df` itself is not modified
reindexed = df.reset_index()
print(reindexed)

   index         A         B         C         D
0      0 -1.015657  0.255399  1.450900  0.907479
1      1  0.019332 -0.552564  1.880839 -1.425081
2      2 -0.283758 -1.185107 -0.922311 -0.786755
3      3  1.273420 -0.372467 -0.948524  0.685333
4      4 -0.626735  0.539313  0.430386 -1.799865


### Setting a column of the dataframe as the Index

In [28]:
# Add a new "States" column to df; the list supplies one name per existing row
state_names = ["Bengal", "Maharashtra", "Karnataka", "Andhra", "Tamil Nadu"]
df["States"] = state_names
print(df)

          A         B         C         D       States
0 -1.015657  0.255399  1.450900  0.907479       Bengal
1  0.019332 -0.552564  1.880839 -1.425081  Maharashtra
2 -0.283758 -1.185107 -0.922311 -0.786755    Karnataka
3  1.273420 -0.372467 -0.948524  0.685333       Andhra
4 -0.626735  0.539313  0.430386 -1.799865   Tamil Nadu


In [29]:
# set_index() returns a new frame that uses the "States" column as its row labels
indexed_by_state = df.set_index("States")
print(indexed_by_state)

                    A         B         C         D
States                                             
Bengal      -1.015657  0.255399  1.450900  0.907479
Maharashtra  0.019332 -0.552564  1.880839 -1.425081
Karnataka   -0.283758 -1.185107 -0.922311 -0.786755
Andhra       1.273420 -0.372467 -0.948524  0.685333
Tamil Nadu  -0.626735  0.539313  0.430386 -1.799865


## Reading Data Using Pandas

- Reading a CSV using Pandas

- pd.read_csv(`path`)

In [38]:
# Load the CSV into a DataFrame and preview its first rows.
# The path is relative, so it resolves against the current working directory.
# NOTE: this rebinds `data`, which earlier in the notebook held the random NumPy array.
train_csv_path = "../../Programming/digit-recognizer/train.csv"
data = pd.read_csv(train_csv_path)
print(data.head())

   label  pixel0  pixel1  pixel2  pixel3  pixel4  pixel5  pixel6  pixel7  \
0      1       0       0       0       0       0       0       0       0   
1      0       0       0       0       0       0       0       0       0   
2      1       0       0       0       0       0       0       0       0   
3      4       0       0       0       0       0       0       0       0   
4      0       0       0       0       0       0       0       0       0   

   pixel8  ...  pixel774  pixel775  pixel776  pixel777  pixel778  pixel779  \
0       0  ...         0         0         0         0         0         0   
1       0  ...         0         0         0         0         0         0   
2       0  ...         0         0         0         0         0         0   
3       0  ...         0         0         0         0         0         0   
4       0  ...         0         0         0         0         0         0   

   pixel780  pixel781  pixel782  pixel783  
0         0         0         

## Using Group By in Pandas for Analysis

- df.groupby(`column`)

In [40]:
# Group the rows of `data` by the value in their "label" column.
# NOTE: this rebinds `labels`, which earlier in the notebook was the list of
# Series index labels; the cells that loop over that list need it re-created before a re-run.
labels = data.groupby(by="label")

In [41]:
# count() gives, for every group, the number of non-null entries in each remaining column
per_label_counts = labels.count()
print(per_label_counts)

       pixel0  pixel1  pixel2  pixel3  pixel4  pixel5  pixel6  pixel7  pixel8  \
label                                                                           
0        4132    4132    4132    4132    4132    4132    4132    4132    4132   
1        4684    4684    4684    4684    4684    4684    4684    4684    4684   
2        4177    4177    4177    4177    4177    4177    4177    4177    4177   
3        4351    4351    4351    4351    4351    4351    4351    4351    4351   
4        4072    4072    4072    4072    4072    4072    4072    4072    4072   
5        3795    3795    3795    3795    3795    3795    3795    3795    3795   
6        4137    4137    4137    4137    4137    4137    4137    4137    4137   
7        4401    4401    4401    4401    4401    4401    4401    4401    4401   
8        4063    4063    4063    4063    4063    4063    4063    4063    4063   
9        4188    4188    4188    4188    4188    4188    4188    4188    4188   

       pixel9  ...  pixel77

In [44]:
# unique() returns the distinct values of the column
unique_labels = data["label"].unique()
print("All the unique values in Labels Column: ", unique_labels)

All the unique values in Labels Column:  [1 0 4 7 3 5 8 9 2 6]


In [46]:
# value_counts() tallies how many rows carry each distinct label.
# The Series is printed on its own line (sep="\n") so its first row is not
# glued onto the end of the message, which misaligned the table.
label_counts = data["label"].value_counts()
print("The count of all the unique values in Labels Column: ", label_counts, sep="\n")

The count of all the unique values in Labels Column:  1    4684
7    4401
3    4351
9    4188
2    4177
6    4137
0    4132
4    4072
8    4063
5    3795
Name: label, dtype: int64
