In [1]:
import pandas as pd
import numpy as np
print(np.__doc__)


NumPy
=====

Provides
  1. An array object of arbitrary homogeneous items
  2. Fast mathematical operations over arrays
  3. Linear Algebra, Fourier Transforms, Random Number Generation

How to use the documentation
----------------------------
Documentation is available in two forms: docstrings provided
with the code, and a loose standing reference guide, available from
`the NumPy homepage <https://numpy.org>`_.

We recommend exploring the docstrings using
`IPython <https://ipython.org>`_, an advanced Python shell with
TAB-completion and introspection capabilities.  See below for further
instructions.

The docstring examples assume that `numpy` has been imported as ``np``::

  >>> import numpy as np

Code snippets are indicated by three greater-than signs::

  >>> x = 42
  >>> x = x + 1

Use the built-in ``help`` function to view a function's docstring::

  >>> help(np.sort)
  ... # doctest: +SKIP

For some objects, ``np.info(obj)`` may provide additional help.  This is
particularly 

In [2]:
print(pd.__doc__)


pandas - a powerful data analysis and manipulation library for Python

**pandas** is a Python package providing fast, flexible, and expressive data
structures designed to make working with "relational" or "labeled" data both
easy and intuitive. It aims to be the fundamental high-level building block for
doing practical, **real world** data analysis in Python. Additionally, it has
the broader goal of becoming **the most powerful and flexible open source data
analysis / manipulation tool available in any language**. It is already well on
its way toward this goal.

Main Features
-------------
Here are just a few of the things that pandas does well:

  - Easy handling of missing data in floating point as well as non-floating
    point data.
  - Size mutability: columns can be inserted and deleted from DataFrame and
    higher dimensional objects
  - Automatic and explicit data alignment: objects can be explicitly aligned
    to a set of labels, or the user can simply ignore the labels and

In [3]:
# creating a synthetic student score data
col_name = [ 'CA1','CA2','CA3']
sit_no = np.arange(1,51)
np.random.seed(2)
score = np.random.randint(1,11,150)
score = score.reshape(50,3)
student_score = pd.DataFrame(data = score, index=sit_no, columns=col_name)
student_score

Unnamed: 0,CA1,CA2,CA3
1,9,9,7
2,3,9,8
3,3,2,6
4,5,5,6
5,8,4,7
6,5,4,8
7,7,2,4
8,6,9,5
9,7,4,10
10,3,1,5


In [4]:
# selecting a single column from dataframe
student_score['CA3']

1      7
2      8
3      6
4      6
5      7
6      8
7      4
8      5
9     10
10     5
11     2
12     3
13     8
14     9
15    10
16     1
17     9
18    10
19     7
20     4
21     2
22     2
23     6
24     5
25     5
26     4
27     6
28     1
29     6
30     5
31     2
32     9
33     3
34     2
35     2
36     2
37     1
38     6
39     2
40     2
41     9
42     8
43     2
44     3
45     5
46     2
47    10
48     3
49     8
50     9
Name: CA3, dtype: int32

In [5]:
# selecting multiple columns
student_score[['CA1','CA3']]

Unnamed: 0,CA1,CA3
1,9,7
2,3,8
3,3,6
4,5,6
5,8,7
6,5,8
7,7,4
8,6,5
9,7,10
10,3,5


In [6]:
 #selecting a row or index using a set index value
student_score.loc[15]

CA1     6
CA2    10
CA3    10
Name: 15, dtype: int32

In [7]:
#selecting multiple rows or index using a set index
student_score.loc[[15,13,41,5]]

Unnamed: 0,CA1,CA2,CA3
15,6,10,10
13,10,9,8
41,9,7,9
5,8,4,7


In [8]:
#selecting multiple index or rows using a default index
student_score.iloc[15+1]

CA1    1
CA2    3
CA3    9
Name: 17, dtype: int32

In [9]:
#selecting multiple rows or index using a default index
student_score.iloc[[15-1,13+1,41-3,5+4]]

Unnamed: 0,CA1,CA2,CA3
15,6,10,10
15,6,10,10
39,3,6,2
10,3,1,5


In [10]:
# creating a column at the end of the dataframe
np.random.seed(2)
student_score['Examscore'] = np.random.randint(25,71,50).reshape(50,1)
student_score

Unnamed: 0,CA1,CA2,CA3,Examscore
1,9,9,7,65
2,3,9,8,40
3,3,2,6,70
4,5,5,6,33
5,8,4,7,47
6,5,4,8,68
7,7,2,4,43
8,6,9,5,36
9,7,4,10,65
10,3,1,5,32


In [11]:
#creating a new column at any location on the dataframe
np.random.seed(2)
student_score.insert(loc=0, value=np.random.randint(13,17,50).reshape(50,1), column ='AGE')
student_score

Unnamed: 0,AGE,CA1,CA2,CA3,Examscore
1,13,9,9,7,65
2,16,3,9,8,40
3,14,3,2,6,70
4,13,5,5,6,33
5,15,8,4,7,47
6,16,5,4,8,68
7,15,7,2,4,43
8,16,6,9,5,36
9,13,7,4,10,65
10,16,3,1,5,32


In [12]:
student_score

Unnamed: 0,AGE,CA1,CA2,CA3,Examscore
1,13,9,9,7,65
2,16,3,9,8,40
3,14,3,2,6,70
4,13,5,5,6,33
5,15,8,4,7,47
6,16,5,4,8,68
7,15,7,2,4,43
8,16,6,9,5,36
9,13,7,4,10,65
10,16,3,1,5,32


In [13]:
# creating a new column from existing columns
student_score['Total_score'] = student_score['CA1'] + student_score['CA2'] + student_score['CA3'] + student_score['Examscore']
student_score

Unnamed: 0,AGE,CA1,CA2,CA3,Examscore,Total_score
1,13,9,9,7,65,90
2,16,3,9,8,40,60
3,14,3,2,6,70,81
4,13,5,5,6,33,49
5,15,8,4,7,47,66
6,16,5,4,8,68,85
7,15,7,2,4,43,56
8,16,6,9,5,36,56
9,13,7,4,10,65,86
10,16,3,1,5,32,41


In [14]:
def grade(totalscore):
    if totalscore>= 70 and totalscore<= 100:
        return 'A'
    elif totalscore>= 60 and totalscore<= 69.99:
        return 'B'
    elif totalscore>= 50 and totalscore<= 59.99:
        return 'C'
    elif totalscore>= 40 and totalscore<= 49.99:
        return 'D'
    elif totalscore>= 31 and totalscore<= 39.99:
        return 'E'
    else:
        return 'F'
student_score['GRADE'] = student_score['Total_score'].apply(grade)
student_score

Unnamed: 0,AGE,CA1,CA2,CA3,Examscore,Total_score,GRADE
1,13,9,9,7,65,90,A
2,16,3,9,8,40,60,B
3,14,3,2,6,70,81,A
4,13,5,5,6,33,49,D
5,15,8,4,7,47,66,B
6,16,5,4,8,68,85,A
7,15,7,2,4,43,56,C
8,16,6,9,5,36,56,C
9,13,7,4,10,65,86,A
10,16,3,1,5,32,41,D


In [15]:
# dropping a column or multiple columns from the datframe
#student_score.drop('column_name', axis = 0 or 1 inplace = true)
# student_score.drop(['A','C'], axis=0 or 1 inplace=true)

In [16]:
student_score.head()

Unnamed: 0,AGE,CA1,CA2,CA3,Examscore,Total_score,GRADE
1,13,9,9,7,65,90,A
2,16,3,9,8,40,60,B
3,14,3,2,6,70,81,A
4,13,5,5,6,33,49,D
5,15,8,4,7,47,66,B


In [17]:
student_score.tail()

Unnamed: 0,AGE,CA1,CA2,CA3,Examscore,Total_score,GRADE
46,14,7,4,2,40,53,C
47,16,9,6,10,66,91,A
48,15,6,5,3,70,84,A
49,13,8,9,8,33,58,C
50,13,4,5,9,42,60,B


In [18]:
student_score.shape

(50, 7)

In [19]:
student_score.info()

<class 'pandas.core.frame.DataFrame'>
Index: 50 entries, 1 to 50
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   AGE          50 non-null     int32 
 1   CA1          50 non-null     int32 
 2   CA2          50 non-null     int32 
 3   CA3          50 non-null     int32 
 4   Examscore    50 non-null     int32 
 5   Total_score  50 non-null     int32 
 6   GRADE        50 non-null     object
dtypes: int32(6), object(1)
memory usage: 3.3+ KB


In [20]:
student_score.describe()

Unnamed: 0,AGE,CA1,CA2,CA3,Examscore,Total_score
count,50.0,50.0,50.0,50.0,50.0,50.0
mean,14.76,5.78,5.82,5.22,48.94,65.76
std,1.204752,2.873453,2.854928,2.880547,14.53414,15.218893
min,13.0,1.0,1.0,1.0,27.0,39.0
25%,14.0,3.0,4.0,2.0,36.0,53.75
50%,15.0,6.0,6.0,5.0,48.0,65.0
75%,16.0,8.0,8.75,8.0,63.75,77.75
max,16.0,10.0,10.0,10.0,70.0,91.0


In [21]:
student_score.head().transpose()

Unnamed: 0,1,2,3,4,5
AGE,13,16,14,13,15
CA1,9,3,3,5,8
CA2,9,9,2,5,4
CA3,7,8,6,6,7
Examscore,65,40,70,33,47
Total_score,90,60,81,49,66
GRADE,A,B,A,D,B


In [22]:
student_score.columns

Index(['AGE', 'CA1', 'CA2', 'CA3', 'Examscore', 'Total_score', 'GRADE'], dtype='object')

In [23]:
student_score.rename({'CA1':'FIRST_CA', 'CA2': 'SECOND CA', 'CA3':'THIRD CA'}, axis=1, inplace=True)
student_score.head()

Unnamed: 0,AGE,FIRST_CA,SECOND CA,THIRD CA,Examscore,Total_score,GRADE
1,13,9,9,7,65,90,A
2,16,3,9,8,40,60,B
3,14,3,2,6,70,81,A
4,13,5,5,6,33,49,D
5,15,8,4,7,47,66,B


In [24]:
student_score['AGE'].loc[1] = 15 
student_score.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  student_score['AGE'].loc[1] = 15


Unnamed: 0,AGE,FIRST_CA,SECOND CA,THIRD CA,Examscore,Total_score,GRADE
1,15,9,9,7,65,90,A
2,16,3,9,8,40,60,B
3,14,3,2,6,70,81,A
4,13,5,5,6,33,49,D
5,15,8,4,7,47,66,B


In [25]:
student_score['GRADE'].unique()

array(['A', 'B', 'D', 'C', 'E'], dtype=object)

In [26]:
student_score['GRADE'].nunique()

5

In [27]:
student_score['GRADE'].value_counts()

GRADE
A    21
C    10
B     9
D     9
E     1
Name: count, dtype: int64

In [28]:
 df = pd.read_csv('sample_pivot.csv')
df.head()

Unnamed: 0,Date,Region,Type,Units,Sales
0,7/11/2020,East,Children's Clothing,18.0,306
1,9/23/2020,North,Children's Clothing,14.0,448
2,4/2/2020,South,Women's Clothing,17.0,425
3,2/28/2020,East,Children's Clothing,26.0,832
4,3/19/2020,West,Women's Clothing,3.0,33


In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Date    1000 non-null   object 
 1   Region  1000 non-null   object 
 2   Type    1000 non-null   object 
 3   Units   911 non-null    float64
 4   Sales   1000 non-null   int64  
dtypes: float64(1), int64(1), object(3)
memory usage: 39.2+ KB


In [30]:
df.isnull().sum()

Date       0
Region     0
Type       0
Units     89
Sales      0
dtype: int64

In [31]:
#forward fill technique
df.tail()

Unnamed: 0,Date,Region,Type,Units,Sales
995,2/11/2020,East,Children's Clothing,35.0,735
996,12/25/2020,North,Men's Clothing,,1155
997,8/31/2020,South,Men's Clothing,13.0,208
998,8/23/2020,South,Women's Clothing,17.0,493
999,8/17/2020,North,Women's Clothing,25.0,300


In [32]:
df1 = df.copy()

In [33]:
df1.fillna(method = 'ffill', inplace=True)
df1.tail()

  df1.fillna(method = 'ffill', inplace=True)


Unnamed: 0,Date,Region,Type,Units,Sales
995,2/11/2020,East,Children's Clothing,35.0,735
996,12/25/2020,North,Men's Clothing,35.0,1155
997,8/31/2020,South,Men's Clothing,13.0,208
998,8/23/2020,South,Women's Clothing,17.0,493
999,8/17/2020,North,Women's Clothing,25.0,300


In [34]:
df1.fillna(method = 'bfill', inplace=True)
df1.tail()

  df1.fillna(method = 'bfill', inplace=True)


Unnamed: 0,Date,Region,Type,Units,Sales
995,2/11/2020,East,Children's Clothing,35.0,735
996,12/25/2020,North,Men's Clothing,35.0,1155
997,8/31/2020,South,Men's Clothing,13.0,208
998,8/23/2020,South,Women's Clothing,17.0,493
999,8/17/2020,North,Women's Clothing,25.0,300


In [39]:
#statistical method
df2 = df.copy()
df2['Units'].fillna(df2['Units'].mean(),inplace=True)
df2.tail()

Unnamed: 0,Date,Region,Type,Units,Sales
995,2/11/2020,East,Children's Clothing,35.0,735
996,12/25/2020,North,Men's Clothing,19.638858,1155
997,8/31/2020,South,Men's Clothing,13.0,208
998,8/23/2020,South,Women's Clothing,17.0,493
999,8/17/2020,North,Women's Clothing,25.0,300


In [40]:
#dropping missing values
df.dropna(inplace=True)
df.isnull().sum()

Date      0
Region    0
Type      0
Units     0
Sales     0
dtype: int64