In [None]:
# MATHEMATICAL COMPUTING WITH PYTHON NUMPY

In [2]:
# NUMPY ARRAYS

import numpy as np    # importing module

array = np.array([1, 2, 3, 4, 5])    # Creating a NumPy array
print(array)  # Output: [1 2 3 4 5]


[1 2 3 4 5]


In [26]:
# DATA TYPES
arr = np.array([1, 2, 3], dtype='float32')
print(arr.dtype)  # Output: float32


#Common Data Types:
        # int32, int64: Integer types
        # float32, float64: Floating-point types
        # bool: Boolean type


float32


In [22]:
# ARRAY CREATION

import numpy as np

# Zero array
zeros = np.zeros((2, 3))  # 2x3 array of zeros

# One array
ones = np.ones((3, 3))    # 3x3 array of ones

# Identity matrix
identity = np.eye(3)      # 3x3 identity matrix

# Random array
random_array = np.random.rand(2, 2)

print(zeros)
print()
print(ones)
print()
print(identity)
print()
print(random_array)

[[0. 0. 0.]
 [0. 0. 0.]]

[[1. 1. 1.]
 [1. 1. 1.]
 [1. 1. 1.]]

[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]

[[0.03123073 0.72113665]
 [0.66961511 0.04346165]]


In [16]:
array = np.array([[1, 2, 3], [4, 5, 6]])
print(array[0, 1])      # Access element (row 0, col 1): 2
print(array[:, 1])      # Access entire 2nd column: [2, 5]
print(array[1, :])      # Access entire 2nd row: [4, 5, 6]


2
[2 5]
[4 5 6]


In [5]:
# DATA PROCESSING USING ARRAYS


# Step 1: Create a large dataset
data = np.random.randint(0, 100, size=(10,))  # Array of 1000 random integers between 0 and 100
# Step 2: Perform an operation (sorting the data)
sorted_data = np.sort(data)  # Sort the data in ascending order
# Step 3: Output the result
print(f'Sorted Data: {sorted_data}')




# 1. ELEMENT-WISE OPERATIONS
doubled_data = data * 2           # Element-wise operation: Multiply every element by 2
print(f'Doubled Data: {doubled_data}')

# 2. AGGREGATE FUNCTIONS
total = np.sum(data)              # Calculate the sum of all elements
print(f'Sum of Data: {total}')
mean_value = np.mean(data)        # Calculate the mean of all elements
print(f'Mean value: {mean_value}')
std_dev = np.std(data)            # Calculate the standard deviation
print(f'Standard Deviation: {std_dev}')

# 3. FILTERING 
filtered_data = data[data > 50]   # Select all values greater than 50
print(f'Filtered Data: {filtered_data}')

# 4. RESHAPING DATA
reshaped_data = data.reshape(5, 2) # 2D array (10 rows, 100 columns)
print(f'Reshaped Data: {reshaped_data}')

# 5. VECTORIZED OPERATIONS
data2 = np.random.randint(0, 100, size=(10,))
sum_data = data + data2          # Add two arrays element-wise
print(f'Sum of 2 Datasets: {sum_data}')

# 6. STATISTICAL FUNCTIONS
median_value = np.median(data)   # Median of the data
print(f'Median value: {median_value}')
variance = np.var(data)          # Variance of the data
print(f'Variance: {variance}')


Sorted Data: [18 26 29 35 39 52 73 76 79 88]
Doubled Data: [ 78 146 176  70  58  36  52 158 152 104]
Sum of Data: 515
Mean value: 51.5
Standard Deviation: 24.20433845408711
Filtered Data: [73 88 79 76 52]
Reshaped Data: [[39 73]
 [88 35]
 [29 18]
 [26 79]
 [76 52]]
Sum of 2 Datasets: [ 74 116 117  98 128  64 108 120  84  83]
Median value: 45.5
Variance: 585.85


In [20]:
# LOADING & SAVING DATA
np.save('data.npy', array)          # loading
loaded_array = np.load('data.npy')  # saving
print(loaded_array)

[[1 2 3]
 [4 5 6]]


In [21]:
# NUMPY RANDOM NUMBERS
random_numbers = np.random.rand(5)  # Array of 5 random numbers
normal_dist = np.random.normal(size=5)  # Normal distribution
print(random_numbers)
print(normal_dist)

[0.8740059  0.96401902 0.26795499 0.34509111 0.35706917]
[0.45546281 0.48021967 1.6891799  0.65397695 0.88885102]


In [None]:
# DATA MANIPULATION WITH PANDAS

In [2]:
# DATA WRANGLING - Data wrangling refers to the process of cleaning, transforming, and structuring data for analysis.

import pandas as pd

data = {'Name': ['Alice', 'Bob', 'Charlie'], 'Age': [25, 30, 35]}   
df = pd.DataFrame(data)   # Sample data dictionary
print(df)                 # View the DataFrame


# The pd.DataFrame(data) creates the DataFrame and the print(df) outputs the content.

      Name  Age
0    Alice   25
1      Bob   30
2  Charlie   35


In [3]:
# DATA EXPLORATION  - Exploring data is essential to understand its structure, types, and quality.

print(df.head())       # View the first 5 rows of the DataFrame
print(df.info())       # Information about the DataFrame (data types, non-null counts)
print(df.describe())   # Summary statistics of numeric columns
print(df.tail())       # View the last 5 rows of the DataFrame

# head() shows the first 5 rows of the DataFrame.
# info() provides an overview of the DataFrame, including column names, number of non-null values, and data types.
# describe() summarizes the statistics (like mean, standard deviation, etc.) of numeric columns.
# tail() return the last 5 rows by default.

      Name  Age
0    Alice   25
1      Bob   30
2  Charlie   35
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    3 non-null      object
 1   Age     3 non-null      int64 
dtypes: int64(1), object(1)
memory usage: 180.0+ bytes
None
        Age
count   3.0
mean   30.0
std     5.0
min    25.0
25%    27.5
50%    30.0
75%    32.5
max    35.0
      Name  Age
0    Alice   25
1      Bob   30
2  Charlie   35


In [31]:
# CLEANING DATA - Data often comes with missing values, duplicates, or other issues that need cleaning.

df.dropna(inplace=True)             # Removing null values, Drop rows with missing values
df.fillna(0,inplace=True)           # Replacing null values with default value like 0
df.drop_duplicates(inplace=True)    # Drop duplicate rows
print(df)

      Name  Age
0    Alice   25
1      Bob   30
2  Charlie   35


In [32]:
# FILTERING DATA  - Filtering allows you to select data based on conditions or criteria.
filtered = df[df['Age'] > 25]   # Filter rows where Age is greater than 25
print(filtered)


      Name  Age
1      Bob   30
2  Charlie   35


In [33]:
# MERGING DATA  - combine two or more datasets using various types of joins (like inner, outer, etc.).

# Sample data
df1 = pd.DataFrame({'Key': [1, 2, 3], 'Value1': ['A', 'B', 'C']})
df2 = pd.DataFrame({'Key': [2, 3, 4], 'Value2': ['X', 'Y', 'Z']})
# Merge on 'Key' column
merged=pd.merge(df1,df2,on='Key',how='inner')
print(merged)


# We merged df1 and df2 on the Key column. The how='inner' argument specifies an inner join, where only rows with matching keys are included in the result.



   Key Value1 Value2
0    2      B      X
1    3      C      Y


In [7]:
# RESHAPING DATA - Reshaping changes the structure of data, such as pivoting or stacking/unstacking.

# Sample data
data = {'Row': ['A', 'A', 'B', 'B'], 'Column': ['X', 'Y', 'X', 'Y'], 'Value': [10, 20, 30, 40]}
df = pd.DataFrame(data)
# Pivoting the data to create a table
pivot=df.pivot_table(values='Value',index='Row',columns='Column')
print(pivot)

# we used pivot_table to reshape the data. The Row becomes the index, Column becomes the columns, and Value becomes the data inside the table.

Column     X     Y
Row               
A       10.0  20.0
B       30.0  40.0


In [2]:
# DATA AGGREGATION - Aggregation is used to summarize data by grouping and applying functions
import pandas as pd
# Sample data
data = {'Category': ['A', 'A', 'B', 'B'], 'Value': [10, 20, 30, 40]}
df = pd.DataFrame(data)
# Grouping by Category and calculating the mean
grouped = df.groupby('Category').mean()
print(grouped)


          Value
Category       
A          15.0
B          35.0


In [None]:
# READING AND WRITING FILES  -  Pandas provides functions to read and write data in various formats like CSV, Excel, etc.

df = pd.read_csv('data.csv')           # Reading from a CSV file
print(df)
df.to_csv('output.csv', index=False)   # Writing to a CSV file
print(df)

# pd.read_csv() reads data from a CSV file and returns it as a DataFrame.
# to_csv() writes the DataFrame to a CSV file.
# The argument index=False ensures that the DataFrame's index is not written to the file.
