<a href="https://colab.research.google.com/github/avijitdeb2023/Data-Science/blob/main/Numpy_and_Pandas_for_Data_Science.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Install required packages (run this first in Google Colab)
!pip install numpy pandas matplotlib

# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
print("NumPy version:", np.__version__)
print("Pandas version:", pd.__version__)

NumPy version: 2.0.2
Pandas version: 2.2.2


# **numpy**

In [None]:
# Creating NumPy arrays
arr1 = np.array([1, 2, 3, 4, 5])
arr2 = np.array([[1, 2, 3], [4, 5, 6]])

print("1D Array:", arr1)
print("2D Array:\n", arr2)
print("Array1 shape:", arr1.shape)
print("Array1 dtype:", arr1.dtype)
print("Array2 shape:", arr2.shape)
print("Array2 dtype:", arr2.dtype)

1D Array: [1 2 3 4 5]
2D Array:
 [[1 2 3]
 [4 5 6]]
Array1 shape: (5,)
Array1 dtype: int64
Array2 shape: (2, 3)
Array2 dtype: int64


In [None]:
# Different ways to create arrays
zeros_array = np.zeros((3, 4))
ones_array = np.ones((2, 3))
range_array = np.arange(0, 10, 2)
linspace_array = np.linspace(0, 1, 4)

print("Zeros array:\n", zeros_array)
print("Ones array:\n", ones_array)
print("Range array:", range_array)
print("Linspace array:", linspace_array)

Zeros array:
 [[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]
Ones array:
 [[1. 1. 1.]
 [1. 1. 1.]]
Range array: [0 2 4 6 8]
Linspace array: [0.         0.33333333 0.66666667 1.        ]


In [2]:
# Mathematical operations on arrays
a = np.array([1, 2, 3, 4, 5])
b = np.array([2, 3, 4, 5, 6])

# Element-wise operations
print("Addition:", a + b)
print("Multiplication:", a * b)
print("Power:", a ** 2)
print("Square root:", np.sqrt(a))
print("Square root:", np.sqrt(b))


Addition: [ 3  5  7  9 11]
Multiplication: [ 2  6 12 20 30]
Power: [ 1  4  9 16 25]
Square root: [1.         1.41421356 1.73205081 2.         2.23606798]
Square root: [1.41421356 1.73205081 2.         2.23606798 2.44948974]


In [None]:
# Statistical operations
print("Mean:", np.mean(a))
print("Standard deviation:", np.std(a))
print("Min:", np.min(a))
print("Max:", np.max(a))


Mean: 3.0
Standard deviation: 1.4142135623730951
Min: 1
Max: 5


In [None]:
# Broadcasting example
matrix = np.array([[1, 2, 3], [4, 5, 6]])
vector = np.array([10, 20, 30])
result = matrix + vector
print("Broadcasting result:\n", result)

Broadcasting result:
 [[11 22 33]
 [14 25 36]]


In [None]:
arr = np.array([10, 20, 30, 40, 50])

# Indexing
print("Element at index 2:", arr[2])

# Slicing
print("Sliced array (from index 1 to 3):", arr[1:4])

# 2D array slicing
arr_2d = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
print("Sliced 2D array (first two rows):\n", arr_2d[:2])
print("Sliced 2D array (first two columns):\n", arr_2d[:,:2])

Element at index 2: 30
Sliced array (from index 1 to 3): [20 30 40]
Sliced 2D array (first two rows):
 [[1 2 3]
 [4 5 6]]
Sliced 2D array (first two columns):
 [[1 2]
 [4 5]
 [7 8]]


# **pandas**

In [None]:
data = {'Name': ['Alice', 'Bob', 'Charlie'],
        'Age': [24, 27, 22],
        'City': ['New York', 'Los Angeles', 'Chicago']}

df = pd.DataFrame(data)
print("DataFrame:\n", df)

DataFrame:
       Name  Age         City
0    Alice   24     New York
1      Bob   27  Los Angeles
2  Charlie   22      Chicago


In [None]:
# Basic DataFrame information
print("\nDataFrame Info:")
print(df.info())



DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    3 non-null      object
 1   Age     3 non-null      int64 
 2   City    3 non-null      object
dtypes: int64(1), object(2)
memory usage: 204.0+ bytes
None


In [None]:
# Basic DataFrame information
print("\nDataFrame Description:")
print(df.describe())



DataFrame Description:
             Age
count   3.000000
mean   24.333333
std     2.516611
min    22.000000
25%    23.000000
50%    24.000000
75%    25.500000
max    27.000000


In [None]:
# Basic DataFrame information

print("\nDataFrame Shape:", df.shape)


DataFrame Shape: (3, 3)


In [None]:
# Basic DataFrame information

print("Column names:", df.columns.tolist())


Column names: ['Name', 'Age', 'City']


In [None]:
# Basic DataFrame information

# Display first 5 rows
print("First 5 rows of the dataset:")
print(df.head())
# Display last 5 rows
print("\nLast 5 rows of the dataset:")
print(df.tail())


First 5 rows of the dataset:
      Name  Age         City
0    Alice   24     New York
1      Bob   27  Los Angeles
2  Charlie   22      Chicago

Last 5 rows of the dataset:
      Name  Age         City
0    Alice   24     New York
1      Bob   27  Los Angeles
2  Charlie   22      Chicago


In [None]:
# Basic DataFrame information

# Check data types
print("\nData Types:")
print(df.dtypes)


Data Types:
Name    object
Age      int64
City    object
dtype: object


In [None]:
# Accessing a single column
print("Names column:\n", df['Name'])

# Accessing multiple columns
print("\n Name and Age columns:\n", df[['Name', 'Age']])


Names column:
 0      Alice
1        Bob
2    Charlie
Name: Name, dtype: object

 Name and Age columns:
       Name  Age
0    Alice   24
1      Bob   27
2  Charlie   22


In [None]:
# Convert NumPy array to DataFrame
np_array = np.random.randn(5, 3)
df_from_array = pd.DataFrame(np_array, columns=['A', 'B', 'C'])
print("DataFrame from NumPy array:")
print(df_from_array)

DataFrame from NumPy array:
          A         B         C
0 -0.792835 -0.119421  1.310956
1 -0.163471  0.259977 -0.401255
2 -1.241731  0.276178 -0.372301
3  0.490316 -0.301418  1.211650
4 -0.881688  0.484441 -0.406529


In [None]:
# Create sample sales data
sales_data = {
    'Region': ['North', 'South', 'East', 'West', 'North', 'South', 'East', 'West'],
    'Product': ['A', 'A', 'B', 'B', 'A', 'B', 'A', 'B'],
    'Sales': [100, 150, 200, 120, 110, 180, 190, 140],
    'Quantity': [10, 15, 20, 12, 11, 18, 19, 14]
}
sales_df = pd.DataFrame(sales_data)
print("Sales DataFrame:")
print(sales_df)

Sales DataFrame:
  Region Product  Sales  Quantity
0  North       A    100        10
1  South       A    150        15
2   East       B    200        20
3   West       B    120        12
4  North       A    110        11
5  South       B    180        18
6   East       A    190        19
7   West       B    140        14


In [None]:
# Group by operations
region_sales = sales_df.groupby('Region')['Sales'].sum()
print("\nSales by Region:")
print(region_sales)



Sales by Region:
Region
East     390
North    210
South    330
West     260
Name: Sales, dtype: int64


In [6]:
from google.colab import drive
import warnings
warnings.filterwarnings('ignore')
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
file_path = '/content/drive/MyDrive/Data Science and ML/diabetes dataset.csv'
columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']
df = pd.read_csv(file_path, names=columns)  # No header in raw CSV
print("Dataset loaded successfully. Shape:", df.shape)

Dataset loaded successfully. Shape: (769, 9)


In [11]:
data_array = df.values
print("NumPy array shape:", data_array.shape)
print("Data type:", data_array.dtype)


NumPy array shape: (769, 9)
Data type: object


In [12]:
# Quick overview
print(df.head())
print(df.tail)

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0  Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI   
1            6      148             72             35        0  33.6   
2            1       85             66             29        0  26.6   
3            8      183             64              0        0  23.3   
4            1       89             66             23       94  28.1   

   DiabetesPedigreeFunction  Age  Outcome  
0  DiabetesPedigreeFunction  Age  Outcome  
1                     0.627   50        1  
2                     0.351   31        0  
3                     0.672   32        1  
4                     0.167   21        0  
<bound method NDFrame.tail of      Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0    Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI   
1              6      148             72             35        0  33.6   
2              1       85             66           

In [13]:
print("\nDataset info:")
print(df.info())



Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 769 entries, 0 to 768
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Pregnancies               769 non-null    object
 1   Glucose                   769 non-null    object
 2   BloodPressure             769 non-null    object
 3   SkinThickness             769 non-null    object
 4   Insulin                   769 non-null    object
 5   BMI                       769 non-null    object
 6   DiabetesPedigreeFunction  769 non-null    object
 7   Age                       769 non-null    object
 8   Outcome                   769 non-null    object
dtypes: object(9)
memory usage: 54.2+ KB
None


In [14]:
# Basic statistics
print("\nDescriptive statistics:")
print(df.describe())


Descriptive statistics:
       Pregnancies Glucose BloodPressure SkinThickness Insulin  BMI  \
count          769     769           769           769     769  769   
unique          18     137            48            52     187  249   
top              1     100            70             0       0   32   
freq           135      17            57           227     374   13   

       DiabetesPedigreeFunction  Age Outcome  
count                       769  769     769  
unique                      518   53       3  
top                       0.258   22       0  
freq                          6   72     500  


In [15]:
#missing values
print("Missing values in each column:")
print(df.isnull().sum())

Missing values in each column:
Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64
