In [1]:
"""
This notebook demonstrates data analysis using pandas and numpy libraries.

Libraries imported:
- pandas: A powerful data manipulation and analysis library
- numpy: A fundamental package for scientific computing with Python

These libraries work together to provide:
- Efficient data structures for tabular data (pandas DataFrame)
- Mathematical operations and array manipulation (numpy arrays)
- Statistical analysis and data transformation capabilities
"""

import pandas as pd
import numpy as np # type: ignore


In [14]:
# Create a numpy array of size 3, filled with ones (1's) and integer type
my_array = np.ones(3, dtype=([('foo', int), ('bar', float)]))
print(my_array['foo'])
print(my_array)

In [16]:
# Create a 1D numpy array of size 3, filled with ones (1's)
# dtype=(int) specifies that all elements should be integers
my_array = np.ones(3, dtype=(int))

# Print the array - this will show: [1 1 1]
print(my_array)   

[1 1 1]


In [4]:
# Convert the array to a record array view (np.recarray)
# This allows accessing fields using dot notation instead of dictionary-style notation
my_array2 = my_array.view(np.recarray)

# Access the 'foo' field using dot notation (my_array2.foo)
# This is equivalent to my_array['foo'] but with a more convenient syntax
print(my_array2.foo)

[1 1 1]


In [20]:
# Create a structured array of size 3, filled with ones
# The array has two named fields:
#   - 'foo': integer type
#   - 'bar': floating-point type
my_array = np.ones(3, dtype=([('foo', int), ('bar', float)]))

# Print the integer field 'foo' values
print(my_array['foo'])

# Print the floating-point field 'bar' values
print(my_array['bar'])


In [23]:
# Convert the structured array to a record array view
# This allows attribute-style access to fields (using dot notation)
my_array2 = my_array.view(np.recarray)

# Print the entire record array with all its fields
print(my_array2)

# Print just the 'foo' field values using dot notation
# This is more intuitive than dictionary-style access my_array['foo']
print(my_array2.foo)


In [27]:
# Create a 2D numpy array with headers and data
# Structure:
#   - First row: Empty cell, followed by column headers (Col1, Col2)
#   - Following rows: Row labels in first column, followed by data values
data = np.array([['','Col1','Col2'],      # Headers row: empty, Col1, Col2
                ['Row1',1,2],             # First data row: Row1 label, values 1,2
                ['Row2',3,4]])            # Second data row: Row2 label, values 3,4

# Print the entire array including headers and data
print(data)

In [30]:
# Create a pandas DataFrame from the numpy array
# This transforms the 2D array into a structured table with labeled rows and columns
pd.DataFrame(
    # Extract data values only (exclude row labels and column headers)
    # data[1:, 1:] means: all rows except first, all columns except first
    data=data[1:, 1:], 
    
    # Set row labels (index) using the first column values (Row1, Row2)
    # data[1:, 0] means: all rows except first, only the first column
    index=data[1:, 0], 
    
    # Set column names using the first row values (Col1, Col2)
    # data[0, 1:] means: only the first row, all columns except first
    columns=data[0, 1:] 
)

Unnamed: 0,Col1,Col2
Row1,1,2
Row2,3,4


In [34]:
df = pd.read_excel("Data.xlsx")
print(df.head())
print(df.shape)
print(df.info())

In [41]:
print(df.describe)


<bound method NDFrame.describe of     Unnamed: 0           Unnamed: 1              Unnamed: 2  Unnamed: 3  \
0          NaN                 DATE                    NAME    DESCRIBE   
1          NaN  2024-02-01 00:00:00                Merhawit  Healthcare   
2          NaN  2024-02-01 00:00:00          Abiel G/Meskel    Forklift   
3          NaN  2024-02-01 00:00:00          Adhanom Tesfay    Computer   
4          NaN  2024-02-01 00:00:00            Bahlbi Guesh    Forklift   
5          NaN  2024-02-01 00:00:00        Natnael Yohannes   Excavator   
6          NaN  2024-02-01 00:00:00           Yosab solomon   Excavator   
7          NaN  2024-02-01 00:00:00            Enginer Fred     Expense   
8          NaN  2024-02-01 00:00:00        Water for office     Expense   
9          NaN  2024-02-01 00:00:00     Awet wood transport     Expense   
10         NaN  2024-03-01 00:00:00  Muyenga site transport     Expense   

   Unnamed: 4 Unnamed: 5  
0       DEBIT     CREDIT  
1      3700

In [42]:
df = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6]]))

# Use the `shape` property
print(df.shape)

# Or use the `len()` function with the `index` property
print(len(df))

(2, 3)
2


In [43]:
# Take a 2D array as input to your DataFrame 
my_2darray = np.array([[1, 2, 3], [4, 5, 6]])
print(pd.DataFrame(my_2darray))

# Take a dictionary as input to your DataFrame 
my_dict = {1: ['1', '3'], 2: ['1', '2'], 3: ['2', '4']}
print(pd.DataFrame(my_dict))

# Take a DataFrame as input to your DataFrame 
my_df = pd.DataFrame(data=[4,5,6,7], index=range(0,4), columns=['A'])
print(pd.DataFrame(my_df))

# Take a Series as input to your DataFrame
my_series = pd.Series({"Belgium":"Brussels", "India":"New Delhi", "United Kingdom":"London", "United States":"Washington"})
print(pd.DataFrame(my_series))

   0  1  2
0  1  2  3
1  4  5  6
   1  2  3
0  1  1  2
1  3  2  4
   A
0  4
1  5
2  6
3  7
                         0
Belgium           Brussels
India            New Delhi
United Kingdom      London
United States   Washington
