# Numpy

### Array
A data structure that is homogenous i.e stores same type of data. This is where it is different from a list

In [1]:
import numpy as np

In [2]:
my_list = [1,2,3,4,5]
arr = np.array(my_list)
print(type(arr),'\n',arr)

<class 'numpy.ndarray'> 
 [1 2 3 4 5]


In [3]:
print(arr.shape) # check dimensions of an array

(5,)


In [8]:
list1 = [1,2,3,2,1]
list2 = [3,4,5,6,7]
list3 = [0,1,1,1,0]
arr2 = np.array([list1,list2,list3])
print(arr2.shape)
print(arr2)
print(arr2.reshape(5,3)) # reshapes the array based on parameters provided

(3, 5)
[[1 2 3 2 1]
 [3 4 5 6 7]
 [0 1 1 1 0]]
[[1 2 3]
 [2 1 3]
 [4 5 6]
 [7 0 1]
 [1 1 0]]


In [12]:
# Indexing
print(arr[3]) # returns the fourth element of the array
print(arr2[:,:]) # the comma inside [] separates the rows and columns 
print(arr2[0:2,2:4]) # selects the rows 0,1(0:2) and columns 2,3(2:4)
print(arr2[2:,2:4])
print(arr2[1,1:4])


4
[[1 2 3 2 1]
 [3 4 5 6 7]
 [0 1 1 1 0]]
[[3 2]
 [5 6]]
[[1 1]]
[4 5 6]


In [13]:
# Functions for an array
arr3 = np.arange(0,15,step=3) # creates an array from 0 to 14 with steps=3;steps is optional
print(arr3)
arr4 = np.linspace(1,10,25) # creates an array from 1 to 10 with 25 digits equidistant from each other
print(arr4)

[ 0  3  6  9 12]
[ 1.     1.375  1.75   2.125  2.5    2.875  3.25   3.625  4.     4.375
  4.75   5.125  5.5    5.875  6.25   6.625  7.     7.375  7.75   8.125
  8.5    8.875  9.25   9.625 10.   ]


In [16]:
# copy and broadcast
org_arr = np.arange(0,10)
print("org_arr - ",org_arr)
copy_arr = org_arr
copy_arr[4:] = 100
print("copy_arr - ",copy_arr)
print("org_arr - ",org_arr)
# Arrays are reference types i.e when we assign the array to a variable or passing it to a function, a reference to the array is being
# passed and not a copy of itself. This is done because it is more memory efficient
copy_arr2 = org_arr.copy() # this passes a copy of the array and not the reference
print('copy_arr2 - ',copy_arr2)
copy_arr2[4:] = 50
print("org_arr - ",org_arr)
print("copy_arr2 - ", copy_arr2)

org_arr -  [0 1 2 3 4 5 6 7 8 9]
copy_arr -  [  0   1   2   3 100 100 100 100 100 100]
org_arr -  [  0   1   2   3 100 100 100 100 100 100]
copy_arr2 -  [  0   1   2   3 100 100 100 100 100 100]
org_arr -  [  0   1   2   3 100 100 100 100 100 100]
copy_arr2 -  [ 0  1  2  3 50 50 50 50 50 50]


In [27]:
# Array operations
arr_new = np.arange(1,11)
print("arr_new - ",arr_new)
print("Checking values less than 3 - ",arr_new<3)
print("Printing only less than 5 values - ",arr_new[arr_new<5])
print("Multiplying each element by 3 - ",arr_new*3) # This is different than list. In a list the number of elements increases by 3 times
print("Multiplication in a list - ",[1,2,3,4,5]*2)
print("creates an array with only 1s of length 6",np.ones(6, dtype=int)) #dtype is optional, default is float
print("Creates a 2D array with 1s of float type - \n",np.ones((3,4),dtype=float))

arr_new -  [ 1  2  3  4  5  6  7  8  9 10]
Checking values less than 3 -  [ True  True False False False False False False False False]
Printing only less than 5 values -  [1 2 3 4]
Multiplying each element by 3 -  [ 3  6  9 12 15 18 21 24 27 30]
Multiplication in a list -  [1, 2, 3, 4, 5, 1, 2, 3, 4, 5]
creates an array with only 1s of length 6 [1 1 1 1 1 1]
Creates a 2D array with 1s of float type - 
 [[1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]]


In [33]:
# Random distribution
print(np.random.rand(3,3)) #Returns random values of the given shape between values 0 and 1
print(np.random.randn(4,4)) #Returns an array with values in random distribution
print(np.random.randint(0,10,8)) # returns an array of size 8 with values b/w 0 and 100
print(np.random.randint(0,10,8).reshape(4,2)) # reshapes the array returned above in a 4x2 array
print(np.random.random_sample((2,3))) # Returns an array of size 2x3 with random float values

[[0.60553157 0.82224659 0.47918131]
 [0.07841325 0.94214785 0.29197856]
 [0.89335452 0.4071153  0.05891234]]
[[-0.99349247  1.6260104  -1.51630451  1.63954888]
 [-0.30553364  0.64295524 -0.86372012 -1.36892808]
 [ 0.71250001  1.19145773  0.7512883  -1.40764393]
 [ 0.11166254 -0.23487621  0.05008688 -0.96848723]]
[5 8 8 8 6 1 6 3]
[[0 1]
 [1 1]
 [9 9]
 [4 4]]
[[0.5218252  0.92448739 0.44380918]
 [0.00296655 0.3487566  0.9901818 ]]


### -------------------------------------

# Pandas

In [34]:
import pandas as pd
import numpy as np

In [35]:
df = pd.DataFrame(np.arange(0,20).reshape(5,4), 
                  index = ['Row1', 'Row2', 'Row3', 'Row4', 'Row5'],
                  columns = ['Col1', 'Col2', 'Col3', 'Col4'])
print(df.head())

      Col1  Col2  Col3  Col4
Row1     0     1     2     3
Row2     4     5     6     7
Row3     8     9    10    11
Row4    12    13    14    15
Row5    16    17    18    19


In [36]:
# Writing a dataframe to csv file
df.to_csv('Test.csv')

In [50]:
# Accessing the elements
# 1 - .loc ->  label-based indexing, need to specify row and column labels. 
# Syntax - df.loc[row_label, column_label]
print(df.loc['Row1']) # print a row with all the columns
print(df.loc[:,'Col2']) # specifying the rows is necesary, here all rows of Col will be printed
print(df.loc['Row3':'Row5','Col2':'Col4'])
# 2 - .iloc -> integer-based indexing.
# Syntax - df.iloc[row_index, column_index]
df.iloc[:1,2:]
# When slicing a df, the result is a series if there is one column. If there is more than 1 column it is a df

print("accessing a column\n",df['Col1'])

Col1    0
Col2    1
Col3    2
Col4    3
Name: Row1, dtype: int32
Row1     1
Row2     5
Row3     9
Row4    13
Row5    17
Name: Col2, dtype: int32
      Col2  Col3  Col4
Row3     9    10    11
Row4    13    14    15
Row5    17    18    19
accessing a column
 Row1     0
Row2     4
Row3     8
Row4    12
Row5    16
Name: Col1, dtype: int32


In [46]:
# Converting a df into an array
print(df.loc['Row3':'Row5','Col2':'Col4'].values) # the .values converts a df to an array
print(df.loc['Row3':'Row5','Col2':'Col4'].values.shape) # shape of the array


[[ 9 10 11]
 [13 14 15]
 [17 18 19]]
(3, 3)


In [47]:
# Functions for a dataframe
print(df.isnull().sum()) # checks for NULL values
print(df['Col1'].value_counts()) # count the occurrences of each unique value in a Series
print(df['Col1'].unique()) # displays unique values in a series
print(df['Col1'].nunique()) # count the number of unique values in a Series

Col1    0
Col2    0
Col3    0
Col4    0
dtype: int64


### Reading CSV files

In [51]:
df = pd.read_csv('mercedesbenz.csv')
df.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


In [52]:
# Get details about index,non-null values,the number of rows and columns, data types, and missing values.
# It does not provide statistical information about the data within the DataFrame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4209 entries, 0 to 4208
Columns: 378 entries, ID to X385
dtypes: float64(1), int64(369), object(8)
memory usage: 12.1+ MB


In [53]:
# generates descriptive statistics of numerical columns in the DataFrame, such as count, mean, std dev, min, max, and quartile values.
# It provides statistical insights into the central tendency, dispersion, and shape of the distribution of numerical data.
# It ignores non-numeric columns and does not provide information about categorical or textual data.
df.describe()

Unnamed: 0,ID,y,X10,X11,X12,X13,X14,X15,X16,X17,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
count,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,...,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0
mean,4205.960798,100.669318,0.013305,0.0,0.075077,0.057971,0.42813,0.000475,0.002613,0.007603,...,0.318841,0.057258,0.314802,0.02067,0.009503,0.008078,0.007603,0.001663,0.000475,0.001426
std,2437.608688,12.679381,0.11459,0.0,0.263547,0.233716,0.494867,0.021796,0.051061,0.086872,...,0.466082,0.232363,0.464492,0.142294,0.097033,0.089524,0.086872,0.040752,0.021796,0.037734
min,0.0,72.11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2095.0,90.82,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,4220.0,99.15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,6314.0,109.01,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,8417.0,265.32,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [54]:
# Reading specific columns from a csv; usecols parameter
df = pd.read_csv('Test.csv', usecols=lambda x: x.upper() in ['COL2','COL3'])
print(df)
df.to_csv('Test2.csv')

   Col2  Col3
0     1     2
1     5     6
2     9    10
3    13    14
4    17    18
