### Day 1: NumPy Basics

In [40]:
# Importing numpy library
import numpy as np

In [1]:
# Importing pandas library
import pandas as pd

In [41]:
# Creating an array from list
lst = [1,2,3,4,5]
np_arr = np.array(lst)
np_arr

array([1, 2, 3, 4, 5])

In [42]:
# Printing the data type
print(type(np_arr))

<class 'numpy.ndarray'>


In [43]:
# Accessing elements in the array by index
np_arr[3]

4

In [44]:
# Multiplying the array with a constant
np_arr2 = np_arr * 2
np_arr2

array([ 2,  4,  6,  8, 10])

In [45]:
# Selection using condition
a = np_arr < 3
print(a)

[ True  True False False False]


In [46]:
# Selection using square brackets, SUBSETTING
np_arr[a]

array([1, 2])

In [47]:
# Arithmetic operators on numpy arrays
# Element-wise addition of two arrays that mix booleans and integers.
# The booleans take part as integers (True -> 1, False -> 0), which is why
# the result below is [4, 5, 2].
np.array([True, 1, 2]) + np.array([3, 4, False])

array([4, 5, 2])

In [48]:
# Elements with different data types, TYPE COERCION
np.array([1,4,'Hi',True])

array(['1', '4', 'Hi', 'True'], dtype='<U11')

### Day 2: NumPy

In [49]:
np_arr

array([1, 2, 3, 4, 5])

In [50]:
# Slicing
print(np_arr[1:3])

[2 3]


In [51]:
# Passing a dictionary to np.array
# NOTE: this does not turn the dictionary entries into array elements. As the
# outputs of the next cells show, the whole dict is stored as one single
# element of an object-dtype array (size 1, dtype 'O').
std = {1:'Ram',2:'Sita',3:'Gita'}
np_std = np.array(std)
np_std

array({1: 'Ram', 2: 'Sita', 3: 'Gita'}, dtype=object)

In [52]:
# Verifying type
print(type(np_std))

<class 'numpy.ndarray'>


In [53]:
# Finding size
np_std.size

1

In [54]:
# Finding data type
np_std.dtype

dtype('O')

In [55]:
# Accessing specific element
# np_std holds the dictionary as its only element, so .item() is used to get
# that Python object back; the value is then looked up by its key (2).
extracted_dict = np_std.item()
extracted_dict[2]

'Sita'

In [56]:
# Build a 2D array from a nested list: each inner list becomes one row
lst_2d = [
    [1, 2, 3],
    [4, 5, 6],
]
np_2d = np.array(lst_2d)
np_2d

array([[1, 2, 3],
       [4, 5, 6]])

In [57]:
# Accessing a specific element in the 2D NumPy array (row 0, column 2).
# Index the ndarray itself with a (row, column) pair instead of the source
# Python list, so the cell actually demonstrates NumPy indexing.
np_2d[0, 2]

3

### Day 3: NumPy

In [58]:
# Create an array filled with zeroes
np.zeros([2,2],int)

array([[0, 0],
       [0, 0]])

In [59]:
# Create an array filled with zeroes
np.zeros((2,2), dtype = int)

array([[0, 0],
       [0, 0]])

In [60]:
# Create an array filled with ones
np.ones((4,5), int)

array([[1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1]])

In [61]:
# Creating an array filled with custom elements
np.full((3,3),7,int)

array([[7, 7, 7],
       [7, 7, 7],
       [7, 7, 7]])

In [62]:
# Creating an "empty" array with 2 elements.
# "Empty" here means uninitialised, not zero-filled: np.empty does not set the
# values, so whatever is shown below is arbitrary and can differ between runs.
np.empty(2) # might vary

array([1.34221126e-311, 0.00000000e+000])

In [63]:
# arange: evenly spaced integers from 4 up to (but not including) 13, in steps of 3
try_arange = np.arange(start=4, stop=13, step=3)
try_arange

array([ 4,  7, 10])

In [64]:
# Sorting the elements
np.sort(try_arange)

array([ 4,  7, 10])

In [65]:
# Concatenate error: all input arrays must have the same number of dimensions.
# Here `a` is 2-D and `b` is 1-D, so np.concatenate raises ValueError.
# The exception is the point of this demo, so it is caught and printed rather
# than left to abort a "Restart Kernel -> Run All".
a = np.array([[1,2,3],[4,5,6]])
b = np.array([7,8,9])
try:
    np.concatenate((a,b))
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: all the input arrays must have same number of dimensions, but the array at index 0 has 2 dimension(s) and the array at index 1 has 1 dimension(s)

In [66]:
# Concatenating 2D arrays: both inputs are 2-D, so the single row of `b`
# is appended below the rows of `a` (axis 0 is the default axis)
a = np.array([[1, 2, 3], [4, 5, 6]])
b = np.array([[7, 8, 9]])
x = np.concatenate([a, b], axis=0)
print(x)

[[1 2 3]
 [4 5 6]
 [7 8 9]]


In [67]:
# Arithmetic operators on 2d array
a + b

array([[ 8, 10, 12],
       [11, 13, 15]])

In [68]:
# Concatenating 1D arrays: the elements of `d` are appended after those of `c`
c = np.array([1, 2, 3, 4])
d = np.array([5, 6, 7, 8])
np.concatenate([c, d])

array([1, 2, 3, 4, 5, 6, 7, 8])

In [69]:
# Arithmetic operators on 1d array
c + d

array([ 6,  8, 10, 12])

In [70]:
x

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [71]:
# Finding dimension
x.ndim

2

In [72]:
# Finding shape
x.shape

(3, 3)

In [73]:
# Finding size
x.size

9

In [74]:
# Creating an array to reshape
old_array = np.arange(8)
old_array

array([0, 1, 2, 3, 4, 5, 6, 7])

In [75]:
# Reshape the 8-element array into 4 rows of 2 columns
new_array = old_array.reshape(4, 2)
new_array

array([[0, 1],
       [2, 3],
       [4, 5],
       [6, 7]])

In [76]:
# Reshaping error: the no. of elements implied by the new shape must match the
# no. of elements in the original array. (2, 2) holds 4 elements but old_array
# has 8, so np.reshape raises ValueError.
# The exception is the point of this demo, so it is caught and printed rather
# than left to abort a "Restart Kernel -> Run All". Nothing is assigned to
# new_array here, so it keeps the valid (4, 2) result from the previous cell.
try:
    np.reshape(old_array, (2,2))
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: cannot reshape array of size 8 into shape (2,2)

#### Basic Array Operations

In [77]:
c, d

(array([1, 2, 3, 4]), array([5, 6, 7, 8]))

In [78]:
c + d

array([ 6,  8, 10, 12])

In [79]:
c - d

array([-4, -4, -4, -4])

In [80]:
c * d

array([ 5, 12, 21, 32])

In [81]:
c/d

array([0.2       , 0.33333333, 0.42857143, 0.5       ])

In [82]:
c%d

array([1, 2, 3, 4])

In [83]:
c**d

array([    1,    64,  2187, 65536])

### Day 4: Pandas

In [85]:
# DataFrame
pd.DataFrame({'Eng': [50, 21], 'Maths': [11, 56]})

Unnamed: 0,Eng,Maths
0,50,11
1,21,56


In [86]:
# Assigning index
pd.DataFrame({'Ram': ['I liked it.', 'It was awful.'], 
              'Shyam': ['Pretty good.', 'Bland.']},
             index=['Icecream', 'Pizza'])

Unnamed: 0,Ram,Shyam
Icecream,I liked it.,Pretty good.
Pizza,It was awful.,Bland.


In [87]:
# Series
pd.Series([1, 2, 3, 4, 5])

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [88]:
# Assigning index
pd.Series([1, 2, 3, 4, 5], index = [2020,2021,2022,2023,2024])

2020    1
2021    2
2022    3
2023    4
2024    5
dtype: int64

In [90]:
# Reading data files
df = pd.read_csv('info.csv')
df

Unnamed: 0,Name,Age,Sex
0,Ram,29.0,M
1,Shyam,22.0,M
2,Sita,27.0,F
3,Hari,27.0,M
4,Rita,24.0,F
5,Gita,,F


In [91]:
# head command
df.head()

Unnamed: 0,Name,Age,Sex
0,Ram,29.0,M
1,Shyam,22.0,M
2,Sita,27.0,F
3,Hari,27.0,M
4,Rita,24.0,F


In [92]:
df.head(2)

Unnamed: 0,Name,Age,Sex
0,Ram,29.0,M
1,Shyam,22.0,M


In [93]:
df.shape

(6, 3)

In [94]:
df.size

18

In [96]:
df.ndim

2

### Day 5: Pandas

In [15]:
df = pd.read_excel('coalpublic2013.xlsx')

In [16]:
df.head()

Unnamed: 0,Year,MSHA ID,Mine_Name,Production,Labor_Hours
0,2013,103381,Tacoa Highwall Miner,56004,22392
1,2013,103404,Reid School Mine,28807,28447
2,2013,100759,North River #1 Underground Min,1440115,474784
3,2013,103246,Bear Creek,87587,29193
4,2013,103451,Knight Mine,147499,46393


In [17]:
df.dtypes

Year            int64
MSHA ID         int64
Mine_Name      object
Production      int64
Labor_Hours     int64
dtype: object

In [18]:
df.dtypes == 'object'

Year           False
MSHA ID        False
Mine_Name       True
Production     False
Labor_Hours    False
dtype: bool

In [19]:
df.columns # returns column labels of the DataFrame

Index(['Year', 'MSHA ID', 'Mine_Name', 'Production', 'Labor_Hours'], dtype='object')

In [20]:
df.describe() # returns description of the data in the DataFrame

Unnamed: 0,Year,MSHA ID,Production,Labor_Hours
count,50.0,50.0,50.0,50.0
mean,2013.0,231168.8,557086.5,221976.3
std,0.0,716784.0,1368744.0,457778.0
min,2013.0,100329.0,0.0,220.0
25%,2013.0,103155.0,25992.25,24393.75
50%,2013.0,103337.0,129638.0,73646.0
75%,2013.0,103408.5,295516.0,161001.5
max,2013.0,5000030.0,7602722.0,2464719.0


In [22]:
# Adding a new column
df['new_col'] = df['Production'] + df['Labor_Hours']

In [23]:
df.head()

Unnamed: 0,Year,MSHA ID,Mine_Name,Production,Labor_Hours,new_col
0,2013,103381,Tacoa Highwall Miner,56004,22392,78396
1,2013,103404,Reid School Mine,28807,28447,57254
2,2013,100759,North River #1 Underground Min,1440115,474784,1914899
3,2013,103246,Bear Creek,87587,29193,116780
4,2013,103451,Knight Mine,147499,46393,193892


In [24]:
# Builds a Categorical from the 'Year' column. Every value of the column is
# kept (see "Length" in the output); the distinct values are what is listed
# under "Categories".
pd.Categorical(df['Year'])

[2013, 2013, 2013, 2013, 2013, ..., 2013, 2013, 2013, 2013, 2013]
Length: 50
Categories (1, int64): [2013]

In [25]:
# Access unique values in a column
df['Year'].unique()

array([2013], dtype=int64)

In [26]:
df['Labor_Hours'] > 150000

0     False
1     False
2      True
3     False
4     False
5     False
6     False
7      True
8     False
9      True
10     True
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18    False
19    False
20    False
21    False
22    False
23    False
24     True
25     True
26    False
27    False
28     True
29    False
30    False
31    False
32     True
33    False
34     True
35    False
36    False
37    False
38    False
39    False
40    False
41     True
42    False
43     True
44    False
45    False
46    False
47     True
48     True
49     True
Name: Labor_Hours, dtype: bool

In [27]:
df[df['Labor_Hours'] > 150000]

Unnamed: 0,Year,MSHA ID,Mine_Name,Production,Labor_Hours,new_col
2,2013,100759,North River #1 Underground Min,1440115,474784,1914899
7,2013,100851,Oak Grove Mine,2269014,1001809,3270823
9,2013,102901,Shoal Creek Mine,1453024,1237415,2690439
10,2013,103180,Sloan Mountain Mine,327780,196963,524743
24,2013,101247,No 4 Mine,2622528,1551141,4173669
25,2013,101401,No 7 Mine,5405412,2464719,7870131
28,2013,103303,Shannon Mine,317491,164388,481879
32,2013,100347,Choctaw Mine,537429,215295,752724
34,2013,102996,Jap Creek Mine,375715,164093,539808
41,2013,103376,Town Creek,299167,176499,475666


In [28]:
# Number of rows with more than 150000 labor hours: count the True entries
# of the boolean mask instead of materialising the filtered DataFrame
int((df['Labor_Hours'] > 150000).sum())

14

In [29]:
df[(df['Labor_Hours'] > 150000) & (df['Production'] > 1000000)]

Unnamed: 0,Year,MSHA ID,Mine_Name,Production,Labor_Hours,new_col
2,2013,100759,North River #1 Underground Min,1440115,474784,1914899
7,2013,100851,Oak Grove Mine,2269014,1001809,3270823
9,2013,102901,Shoal Creek Mine,1453024,1237415,2690439
24,2013,101247,No 4 Mine,2622528,1551141,4173669
25,2013,101401,No 7 Mine,5405412,2464719,7870131
48,2013,5000030,Usibelli,1631584,286079,1917663
49,2013,201195,Kayenta Mine,7602722,1015333,8618055


### Day 6: Pandas

In [30]:
df.head()

Unnamed: 0,Year,MSHA ID,Mine_Name,Production,Labor_Hours,new_col
0,2013,103381,Tacoa Highwall Miner,56004,22392,78396
1,2013,103404,Reid School Mine,28807,28447,57254
2,2013,100759,North River #1 Underground Min,1440115,474784,1914899
3,2013,103246,Bear Creek,87587,29193,116780
4,2013,103451,Knight Mine,147499,46393,193892


In [31]:
# Check for missing values i.e N/A
df.isna()

Unnamed: 0,Year,MSHA ID,Mine_Name,Production,Labor_Hours,new_col
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,False,False,False,False
3,False,False,False,False,False,False
4,False,False,False,False,False,False
5,False,False,False,False,False,False
6,False,False,False,False,False,False
7,False,False,False,False,False,False
8,False,False,False,False,False,False
9,False,False,False,False,False,False


In [32]:
# Count the no. of missing values in each column
# (isna() marks every missing cell as True; sum() then adds them up per column)
df.isna().sum()

Year           0
MSHA ID        0
Mine_Name      0
Production     0
Labor_Hours    0
new_col        0
dtype: int64

In [33]:
# Remove rows that contain missing values.
# dropna() returns a new DataFrame; the result is only displayed here and not
# assigned back, so df itself is left unchanged.
df.dropna()

Unnamed: 0,Year,MSHA ID,Mine_Name,Production,Labor_Hours,new_col
0,2013,103381,Tacoa Highwall Miner,56004,22392,78396
1,2013,103404,Reid School Mine,28807,28447,57254
2,2013,100759,North River #1 Underground Min,1440115,474784,1914899
3,2013,103246,Bear Creek,87587,29193,116780
4,2013,103451,Knight Mine,147499,46393,193892
5,2013,103433,Crane Central Mine,69339,47195,116534
6,2013,100329,Concord Mine,0,144002,144002
7,2013,100851,Oak Grove Mine,2269014,1001809,3270823
8,2013,102901,Shoal Creek Mine,0,12396,12396
9,2013,102901,Shoal Creek Mine,1453024,1237415,2690439


In [54]:
# Replace missing values with the given value ('abc' here).
# fillna() returns a new DataFrame; the result is only displayed here and not
# assigned back, so df itself is left unchanged.
df.fillna('abc')

Unnamed: 0,Year,MSHA ID,Mine_Name,Production,Labor_Hours
0,2013,103381,Tacoa Highwall Miner,56004,22392
1,2013,103404,Reid School Mine,28807,28447
2,2013,100759,North River #1 Underground Min,1440115,474784
3,2013,103246,Bear Creek,87587,29193
4,2013,103451,Knight Mine,147499,46393
5,2013,103433,Crane Central Mine,69339,47195
6,2013,100329,Concord Mine,0,144002
7,2013,100851,Oak Grove Mine,2269014,1001809
8,2013,102901,Shoal Creek Mine,0,12396
9,2013,102901,Shoal Creek Mine,1453024,1237415


In [35]:
df.columns

Index(['Year', 'MSHA ID', 'Mine_Name', 'Production', 'Labor_Hours', 'new_col'], dtype='object')

In [36]:
# Remove a column.
# drop() returns a new DataFrame without 'new_col'; the result is only
# displayed here and not assigned back, so df still has the column.
df.drop(columns=['new_col'])

Unnamed: 0,Year,MSHA ID,Mine_Name,Production,Labor_Hours
0,2013,103381,Tacoa Highwall Miner,56004,22392
1,2013,103404,Reid School Mine,28807,28447
2,2013,100759,North River #1 Underground Min,1440115,474784
3,2013,103246,Bear Creek,87587,29193
4,2013,103451,Knight Mine,147499,46393
5,2013,103433,Crane Central Mine,69339,47195
6,2013,100329,Concord Mine,0,144002
7,2013,100851,Oak Grove Mine,2269014,1001809
8,2013,102901,Shoal Creek Mine,0,12396
9,2013,102901,Shoal Creek Mine,1453024,1237415


In [55]:
df.head()

Unnamed: 0,Year,MSHA ID,Mine_Name,Production,Labor_Hours
0,2013,103381,Tacoa Highwall Miner,56004,22392
1,2013,103404,Reid School Mine,28807,28447
2,2013,100759,North River #1 Underground Min,1440115,474784
3,2013,103246,Bear Creek,87587,29193
4,2013,103451,Knight Mine,147499,46393


In [39]:
# Remove a column from the dataset itself by rebinding df to the result.
# errors='ignore' keeps the cell re-runnable: if 'new_col' is already gone
# (e.g. the cell is executed a second time) this is a no-op instead of a
# KeyError.
df = df.drop(columns=['new_col'], errors='ignore')

In [40]:
df.head()

Unnamed: 0,Year,MSHA ID,Mine_Name,Production,Labor_Hours
0,2013,103381,Tacoa Highwall Miner,56004,22392
1,2013,103404,Reid School Mine,28807,28447
2,2013,100759,North River #1 Underground Min,1440115,474784
3,2013,103246,Bear Creek,87587,29193
4,2013,103451,Knight Mine,147499,46393


In [47]:
# Remove a row by its index label (49 is matched against the index labels,
# not used as a position).
# df is rebound to the result and errors='ignore' keeps the cell re-runnable:
# if the row is already gone this is a no-op instead of a KeyError.
df = df.drop(index=49, errors='ignore')

In [48]:
df.tail()

Unnamed: 0,Year,MSHA ID,Mine_Name,Production,Labor_Hours
44,2013,103423,Dutton Hill Mine,37275,9162
45,2013,1519322,Ghm #25,25054,3108
46,2013,103321,Poplar Springs,189370,76366
47,2013,103358,Old Union,284563,161805
48,2013,5000030,Usibelli,1631584,286079


In [52]:
# Replace values in a column: every 'Old Union' in 'Mine_Name' becomes 'OU'.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on df['Mine_Name']: that chained in-place form
# acts on the selected Series, emits a FutureWarning in recent pandas and
# does not update df at all once Copy-on-Write is enabled.
df['Mine_Name'] = df['Mine_Name'].replace('Old Union', 'OU')

In [53]:
df.tail()

Unnamed: 0,Year,MSHA ID,Mine_Name,Production,Labor_Hours
44,2013,103423,Dutton Hill Mine,37275,9162
45,2013,1519322,Ghm #25,25054,3108
46,2013,103321,Poplar Springs,189370,76366
47,2013,103358,OU,284563,161805
48,2013,5000030,Usibelli,1631584,286079
