# Overview of Numpy, Pandas and Plotly for Data Science

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates 

from pandas_datareader import data as web

# Pandas

In [45]:
# Read data from CSV
df = pd.read_csv("/kaggle/input/tesla-stock-data-updated-till-28jun2021/TSLA.csv")

**If Reading from an excel file**
```python
df = pd.read_excel("Path")
```

In [67]:
# Read data from HTML Table
df_HTML = pd.read_html("https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)")[2]
df_HTML.columns = df_HTML.columns.get_level_values(0)
df_HTML

Unnamed: 0,Country/Territory,UN region,IMF[1][13],IMF[1][13].1,World Bank[14],World Bank[14].1,United Nations[15],United Nations[15].1
0,World,—,105568776,2023,96513077,2021,85328323,2020
1,United States,Americas,26854599,2023,22996100,2021,23315081,2021
2,China,Asia,19373586,[n 1]2023,17734063,[n 3]2021,17734131,[n 1]2021
3,Japan,Asia,4409738,2023,4937422,2021,4940878,2021
4,Germany,Europe,4308854,2023,4223116,2021,4259935,2021
...,...,...,...,...,...,...,...,...
212,Anguilla,Americas,—,—,—,—,258,2020
213,Kiribati,Oceania,248,2023,207,2021,181,2020
214,Nauru,Oceania,151,2023,133,2021,135,2020
215,Montserrat,Americas,—,—,—,—,68,2020


In [68]:
population = pd.read_html("https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)", )[2]
population.columns = population.columns.get_level_values(1)
population

Unnamed: 0,Country/Territory,UN region,Estimate,Year,Estimate.1,Year.1,Estimate.2,Year.2
0,World,—,105568776,2023,96513077,2021,85328323,2020
1,United States,Americas,26854599,2023,22996100,2021,23315081,2021
2,China,Asia,19373586,[n 1]2023,17734063,[n 3]2021,17734131,[n 1]2021
3,Japan,Asia,4409738,2023,4937422,2021,4940878,2021
4,Germany,Europe,4308854,2023,4223116,2021,4259935,2021
...,...,...,...,...,...,...,...,...
212,Anguilla,Americas,—,—,—,—,258,2020
213,Kiribati,Oceania,248,2023,207,2021,181,2020
214,Nauru,Oceania,151,2023,133,2021,135,2020
215,Montserrat,Americas,—,—,—,—,68,2020


In [69]:
# Removing space from Column names 
population.columns = [c.replace(' ', '_') for c in population.columns]
df_HTML.columns = [x.replace(' ', '_') for x in df_HTML.columns]

In [73]:
# Cleaning up the column names using RegEx
df_HTML.columns = df_HTML.columns.str.replace(r"\[.*\]","")

  df_HTML.columns = df_HTML.columns.str.replace(r"\[.*\]","")


In [76]:
df_HTML

Unnamed: 0,Country/Territory,UN_region,IMF,IMF.1,World_Bank,World_Bank.1,United_Nations,United_Nations.1
0,World,—,105568776,2023,96513077,2021,85328323,2020
1,United States,Americas,26854599,2023,22996100,2021,23315081,2021
2,China,Asia,19373586,[n 1]2023,17734063,[n 3]2021,17734131,[n 1]2021
3,Japan,Asia,4409738,2023,4937422,2021,4940878,2021
4,Germany,Europe,4308854,2023,4223116,2021,4259935,2021
...,...,...,...,...,...,...,...,...
212,Anguilla,Americas,—,—,—,—,258,2020
213,Kiribati,Oceania,248,2023,207,2021,181,2020
214,Nauru,Oceania,151,2023,133,2021,135,2020
215,Montserrat,Americas,—,—,—,—,68,2020


In [84]:
# Because in this data the Organization Year is also listed with the same column name will need to replace it
def add_col_name_year( df, col):
    counter = 0
    for i in range(len(df.columns)):
        if df.columns[i] == col:
            counter += 1
        if counter == 2:
            df.columns.values[i] = f"{col}_Year"

In [87]:
add_col_name_year(df_HTML, col = "IMF")
add_col_name_year(df_HTML, col = "World_Bank")
add_col_name_year(df_HTML, col = "United_Nations")

In [88]:
df_HTML.columns

Index(['Country/Territory', 'UN_region', 'IMF', 'IMF_Year', 'IMF_Year',
       'IMF_Year', 'IMF_Year', 'IMF_Year'],
      dtype='object')

In [None]:
# Cleaning up the data removing brackets etc. 
df_HTML.IMF = df_HTML.IMF.str.replace(r"\[.*\]","")

# Numpy

In [5]:
# Creating a Numpy array

c = [1,2,3,4]
n = np.array(c)
n

array([1, 2, 3, 4])

In [8]:
# Creat array from a given range
r = np.arange(1,5)
r

array([1, 2, 3, 4])

In [9]:
# Step range
sr = np.arange(1,10,2)
sr

array([1, 3, 5, 7, 9])

In [11]:
# Creating a Matrix

mat1 = np.zeros((4,4))
mat1

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [15]:
# Matrix of ones
mat2 = np.ones((3,4))
mat2

array([[1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.]])

In [16]:
# Random Value matrix
mat3 = np.random.randint(0,50, (3,4))
mat3

array([[ 3, 26, 48, 33],
       [37, 42, 12, 34],
       [44, 38,  2, 31]])

In [17]:
# Linear Spaced Range
lin = np.linspace(1,10,5)
lin

array([ 1.  ,  3.25,  5.5 ,  7.75, 10.  ])

In [23]:
# Reshaping arrays

nparr = np.random.randint(1,25,16)
nparr

array([12, 16, 18,  3,  4, 24, 20, 17, 12, 20, 19, 10, 11, 11, 14, 20])

In [24]:
# Reshaping to 4 rows and 4 columns
nparr1 = nparr.reshape(4,4)
nparr1

array([[12, 16, 18,  3],
       [ 4, 24, 20, 17],
       [12, 20, 19, 10],
       [11, 11, 14, 20]])

In [26]:
# Reshaping into a 3D array
nparr3D = nparr.reshape(4,2,2)
nparr3D

array([[[12, 16],
        [18,  3]],

       [[ 4, 24],
        [20, 17]],

       [[12, 20],
        [19, 10]],

       [[11, 11],
        [14, 20]]])

In [31]:
mat3D = nparr3D.reshape(2,4,2) 
mat3D

array([[[12, 16],
        [18,  3],
        [ 4, 24],
        [20, 17]],

       [[12, 20],
        [19, 10],
        [11, 11],
        [14, 20]]])

In [32]:
# Getting a value -> to get 4
mat3D[0,2,0]

4

In [34]:
# Fi;ter array (getting all values greater or equal to 4)
print(mat3D)
mat3D >=4

[[[12 16]
  [18  3]
  [ 4 24]
  [20 17]]

 [[12 20]
  [19 10]
  [11 11]
  [14 20]]]


array([[[ True,  True],
        [ True, False],
        [ True,  True],
        [ True,  True]],

       [[ True,  True],
        [ True,  True],
        [ True,  True],
        [ True,  True]]])

In [35]:
# Returning only values that meet a condition
mat3D[mat3D >= 4]

array([12, 16, 18,  4, 24, 20, 17, 12, 20, 19, 10, 11, 11, 14, 20])

In [42]:
# Setting a seed for reproducibility of results 
np.random.seed(0)

In [44]:
# Statistics 
ranmat = np.random.randint(0,100, 50)
print(ranmat)
print("Mean: ", ranmat.mean())
print("Std: ", ranmat.std())
print("Variance: ", ranmat.var())
print("Median: ", np.median(ranmat))
print("Min: ", ranmat.min())
print("Max: ", ranmat.max())

[75 55 28 34  0  0 36 53  5 38 17 79  4 42 58 31  1 65 41 57 35 11 46 82
 91  0 14 99 53 12 42 84 75 68  6 68 47  3 76 52 78 15 20 99 58 23 79 13
 85 48]
Mean:  44.02
Std:  29.411895552650122
Variance:  865.0596
Median:  44.0
Min:  0
Max:  99
