# Creating DataFrames
Let’s look at different ways to create a Pandas DataFrame — the core data structure you’ll be using 90% of the time in data science.



# From Python Lists, Dictionaries, and NumPy Arrays


In [3]:
import pandas as pd 
# Row-oriented input: every inner list is one record of the form [name, marks].
data = [
    ["harry", 34],
    ["jack", 67],
    ["tak", 27],
    ["mank", 64],
    ["perk", 87],
]

In [4]:
# Echo the nested list to check its contents before building a DataFrame from it.
data


[['harry', 34], ['jack', 67], ['tak', 27], ['mank', 64], ['perk', 87]]

In [5]:
# Each inner list of `data` becomes one row; `columns` supplies the header labels.
column_names = ["Name", "Marks"]
pd.DataFrame(data=data, columns=column_names)

Unnamed: 0,Name,Marks
0,harry,34
1,jack,67
2,tak,27
3,mank,64
4,perk,87


In [6]:
# Column-oriented input: each key is a column label and each list holds
# that column's values.
data = dict(
    a=[1, 2, 3],
    b=[4, 5, 6],
)
data

{'a': [1, 2, 3], 'b': [4, 5, 6]}

In [7]:
# Build the frame from the dict: keys turn into column labels and the
# row index is generated automatically (0, 1, 2, ...).
df = pd.DataFrame(data=data)
df

Unnamed: 0,a,b
0,1,4
1,2,5
2,3,6


In [8]:
import numpy as np

# A 2-D array maps directly onto a table: each inner list is a row, and
# `columns` labels the three positions within a row.
arr = np.array(
    [
        [1, 2, 3],
        [4, 5, 6],
    ]
)
df = pd.DataFrame(data=arr, columns=list("ABC"))

In [9]:
# Display the DataFrame built from the NumPy array.
df

Unnamed: 0,A,B,C
0,1,2,3
1,4,5,6


# From Excel Files

In [11]:
# Read the workbook through a path relative to the notebook's working
# directory instead of a machine-specific absolute path, so the cell runs on
# any machine that keeps the file next to the notebook.
# NOTE(review): assumes the notebook is run from the Data-Science-Journey-2026
# folder that contains the file — confirm. The doubled ".xlsx.xlsx" extension
# is kept because that is the file name the original path pointed at.
df = pd.read_excel("data.xlsx.xlsx")
df

Unnamed: 0,NAME,SCHOOL,MARKS
0,anshi,DPS,56
1,anshika,DPS,78
2,avani,LLS,89
3,priya,LLS,89
4,aliza,RDS,45
5,sanya,RDS,56


# From CSV Files

In [13]:
# Read the CSV through a path relative to the notebook's working directory
# instead of a machine-specific absolute path, so the cell runs on any machine
# that keeps the file next to the notebook.
# NOTE(review): assumes the notebook is run from the Data-Science-Journey-2026
# folder that contains data.csv — confirm.
df = pd.read_csv("data.csv")
df


Unnamed: 0,NAME,SCHOOL,MARKS
0,anshi,DPS,56
1,anshika,DPS,78
2,avani,LLS,89
3,priya,LLS,89
4,aliza,RDS,45
5,sanya,RDS,56


# From JSON

In [15]:
# Read the JSON file through a path relative to the notebook's working
# directory instead of a machine-specific absolute path, so the cell runs on
# any machine that keeps the file next to the notebook.
# NOTE(review): assumes the notebook is run from the Data-Science-Journey-2026
# folder that contains newdata.json — confirm.
df = pd.read_json("newdata.json")
df

Unnamed: 0,name,age
0,Anshika,20
1,Babita,22


# From the Web (Example: CSV from URL)

In [30]:
# read_csv is handed the URL directly, so no separate download step is needed.
TIPS_URL = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/tips.csv"
df = pd.read_csv(TIPS_URL)
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


# EDA (Exploratory Data Analysis)
Exploratory Data Analysis (EDA) is an essential first step in any data science project.

It involves taking a deep look at the dataset to understand its structure, spot patterns, identify anomalies, and uncover relationships between variables. This process includes generating summary statistics, checking for missing or duplicate data, and creating visualizations like histograms, box plots, and scatter plots. The goal of EDA is to get a clear picture of what the data is telling you before applying any analysis or machine learning models.

By exploring the data thoroughly, you can make better decisions about how to clean, transform, and model it effectively.

Once your DataFrame is ready, run these to understand your data:

In [33]:
df.head()  # First 5 rows (head() returns 5 rows when no count is given)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [37]:
df.tail()         # Last 5 rows (tail() returns 5 rows when no count is given)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.0,Female,Yes,Sat,Dinner,2
241,22.67,2.0,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2
243,18.78,3.0,Female,No,Thur,Dinner,2


In [39]:
df.shape  # Tuple of (number of rows, number of columns)

(244, 7)

In [41]:
df.columns        # Column labels, returned as a pandas Index

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')

In [43]:
df.describe()     # Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns

Unnamed: 0,total_bill,tip,size
count,244.0,244.0,244.0
mean,19.785943,2.998279,2.569672
std,8.902412,1.383638,0.9511
min,3.07,1.0,1.0
25%,13.3475,2.0,2.0
50%,17.795,2.9,2.0
75%,24.1275,3.5625,3.0
max,50.81,10.0,6.0


In [45]:
df.info()         # Per-column dtype and non-null count, plus index range and memory usage

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    object 
 3   smoker      244 non-null    object 
 4   day         244 non-null    object 
 5   time        244 non-null    object 
 6   size        244 non-null    int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 13.5+ KB
