In [1]:
import json
import pandas as pd
import glob

# Directory containing all JSON files
json_files = glob.glob("C:\\Users\\HP\\Downloads\\ipl_male_json (1)\\*.json")

# Initialize an empty DataFrame to store all matches' data
all_matches_df = pd.DataFrame()

# Process each JSON file
for file in json_files:
    with open(file, 'r') as f:
        data = json.load(f)

    # Extract innings data and check if there are two innings
    innings = data.get('innings', [])
    if len(innings) < 2:
        # Skip files with fewer than two innings
        continue

    total_runs = None
    match_df = pd.DataFrame()

    # First inning to get target
    first_inning = innings[0]
    total_runs = sum(delivery['runs']['total'] for over in first_inning['overs'] for delivery in over['deliveries'])
    target = total_runs + 1

    # Process the second inning (chasing)
    second_inning = innings[1]
    batting_team = second_inning['team']
    bowling_team = first_inning['team']
    
    balls_left = 120  # Assuming T20 match, 20 overs * 6 balls
    wickets_left = 10
    cumulative_runs = 0

    rows = []  # List to accumulate rows before converting to DataFrame

    for over in second_inning['overs']:
        for delivery in over['deliveries']:
            balls_left -= 1
            
            # Runs scored on this delivery
            runs_scored = delivery['runs']['total']
            cumulative_runs += runs_scored
            
            # Check for wicket
            if 'wickets' in delivery:
                wickets_left -= len(delivery['wickets'])
            
            # Calculate columns
            runs_left = target - cumulative_runs
            crr = (cumulative_runs / ((120 - balls_left) / 6)) if balls_left < 120 else 0
            rrr = (runs_left / (balls_left / 6)) if balls_left > 0 else 0
            
            # Set result to 1 or 0 based on winner; default to None if 'winner' is missing
            result = 1 if data['info'].get('outcome', {}).get('winner') == batting_team else 0

            # Append data for each ball as a dictionary in the rows list
            rows.append({
                'batting_team': batting_team,
                'bowling_team': bowling_team,
                'runs_left': runs_left,
                'balls_left': balls_left,
                'wickets_left': wickets_left,
                'total_runs': total_runs,
                'target': target,
                'crr': crr,
                'rrr': rrr,
                'result': result
            })

    # Convert rows to a DataFrame and then concatenate to match_df
    match_df = pd.DataFrame(rows)
    all_matches_df = pd.concat([all_matches_df, match_df], ignore_index=True)

# Display the first few rows of the final DataFrame
all_matches_df.head()


Unnamed: 0,batting_team,bowling_team,runs_left,balls_left,wickets_left,total_runs,target,crr,rrr,result
0,Royal Challengers Bangalore,Sunrisers Hyderabad,207,119,10,207,208,6.0,10.436975,0
1,Royal Challengers Bangalore,Sunrisers Hyderabad,207,118,10,207,208,3.0,10.525424,0
2,Royal Challengers Bangalore,Sunrisers Hyderabad,207,117,10,207,208,2.0,10.615385,0
3,Royal Challengers Bangalore,Sunrisers Hyderabad,205,116,10,207,208,4.5,10.603448,0
4,Royal Challengers Bangalore,Sunrisers Hyderabad,201,115,10,207,208,8.4,10.486957,0


In [2]:
all_matches_df.tail()

Unnamed: 0,batting_team,bowling_team,runs_left,balls_left,wickets_left,total_runs,target,crr,rrr,result
125736,Royal Challengers Bangalore,Sunrisers Hyderabad,15,0,4,208,209,9.7,0.0,0
125737,Royal Challengers Bangalore,Sunrisers Hyderabad,15,-1,3,208,209,9.619835,0.0,0
125738,Royal Challengers Bangalore,Sunrisers Hyderabad,14,-2,3,208,209,9.590164,0.0,0
125739,Royal Challengers Bangalore,Sunrisers Hyderabad,13,-3,3,208,209,9.560976,0.0,0
125740,Royal Challengers Bangalore,Sunrisers Hyderabad,9,-4,3,208,209,9.677419,0.0,0


In [3]:
all_matches_df.shape

(125741, 10)

In [4]:
json_files = glob.glob("C:\\Users\\HP\\Downloads\\ipl_male_json (1)\\*.json")
print(f"Total files found: {len(json_files)}")


Total files found: 1095


In [5]:
print(f"Total files found: {len(json_files)}")
total_rows = 0
for file in json_files:
    # Process each file
    with open(file, 'r') as f:
        data = json.load(f)
    # Skip files with fewer than two innings
    innings = data.get('innings', [])
    if len(innings) < 2:
        continue

    # Process inning data (existing code)
    match_df = pd.DataFrame(rows)
    rows_count = len(match_df)
    total_rows += rows_count
    print(f"File: {file}, Rows added: {rows_count}, Total rows: {total_rows}")
    
    # Add match data to all_matches_df
    all_matches_df = pd.concat([all_matches_df, match_df], ignore_index=True)


Total files found: 1095
File: C:\Users\HP\Downloads\ipl_male_json (1)\1082591.json, Rows added: 124, Total rows: 124
File: C:\Users\HP\Downloads\ipl_male_json (1)\1082592.json, Rows added: 124, Total rows: 248
File: C:\Users\HP\Downloads\ipl_male_json (1)\1082593.json, Rows added: 124, Total rows: 372
File: C:\Users\HP\Downloads\ipl_male_json (1)\1082594.json, Rows added: 124, Total rows: 496
File: C:\Users\HP\Downloads\ipl_male_json (1)\1082595.json, Rows added: 124, Total rows: 620
File: C:\Users\HP\Downloads\ipl_male_json (1)\1082596.json, Rows added: 124, Total rows: 744
File: C:\Users\HP\Downloads\ipl_male_json (1)\1082597.json, Rows added: 124, Total rows: 868
File: C:\Users\HP\Downloads\ipl_male_json (1)\1082598.json, Rows added: 124, Total rows: 992
File: C:\Users\HP\Downloads\ipl_male_json (1)\1082599.json, Rows added: 124, Total rows: 1116
File: C:\Users\HP\Downloads\ipl_male_json (1)\1082600.json, Rows added: 124, Total rows: 1240
File: C:\Users\HP\Downloads\ipl_male_json (1

In [6]:
print("DataFrame shape:", all_matches_df.shape)

DataFrame shape: (261149, 10)


In [7]:
all_matches_df.shape

(261149, 10)

In [8]:
all_matches_df['batting_team'].unique()

array(['Royal Challengers Bangalore', 'Rising Pune Supergiant',
       'Kolkata Knight Riders', 'Kings XI Punjab', 'Delhi Daredevils',
       'Sunrisers Hyderabad', 'Mumbai Indians', 'Gujarat Lions',
       'Chennai Super Kings', 'Rajasthan Royals', 'Delhi Capitals',
       'Punjab Kings', 'Gujarat Titans', 'Lucknow Super Giants',
       'Royal Challengers Bengaluru', 'Deccan Chargers', 'Pune Warriors',
       'Kochi Tuskers Kerala', 'Rising Pune Supergiants'], dtype=object)

In [9]:
all_matches_df['bowling_team'].unique()

array(['Sunrisers Hyderabad', 'Mumbai Indians', 'Gujarat Lions',
       'Rising Pune Supergiant', 'Royal Challengers Bangalore',
       'Kolkata Knight Riders', 'Delhi Daredevils', 'Kings XI Punjab',
       'Rajasthan Royals', 'Chennai Super Kings', 'Delhi Capitals',
       'Punjab Kings', 'Lucknow Super Giants', 'Gujarat Titans',
       'Royal Challengers Bengaluru', 'Deccan Chargers',
       'Kochi Tuskers Kerala', 'Pune Warriors', 'Rising Pune Supergiants'],
      dtype=object)

In [10]:
teams = [
    'Sunrisers Hyderabad',
    'Mumbai Indians',
    'Royal Challengers Bengaluru',
    'Kolkata Knight Riders',
    'Punjab Kings',
    'Chennai Super Kings',
    'Rajasthan Royals',
    'Delhi Capitals'
]

In [11]:
all_matches_df['batting_team'] = all_matches_df['batting_team'].str.replace('Delhi Daredevils','Delhi Capitals')
all_matches_df['bowling_team'] = all_matches_df['bowling_team'].str.replace('Delhi Daredevils','Delhi Capitals')

all_matches_df['batting_team'] = all_matches_df['batting_team'].str.replace('Deccan Chargers','Sunrisers Hyderabad')
all_matches_df['bowling_team'] = all_matches_df['bowling_team'].str.replace('Deccan Chargers','Sunrisers Hyderabad')


all_matches_df['batting_team'] = all_matches_df['batting_team'].str.replace('Kings XI Punjab','Punjab Kings')
all_matches_df['bowling_team'] = all_matches_df['bowling_team'].str.replace('Kings XI Punjab','Punjab Kings')


all_matches_df['batting_team'] = all_matches_df['batting_team'].str.replace('Royal Challengers Bangalore','Royal Challengers Bengaluru')
all_matches_df['bowling_team'] = all_matches_df['bowling_team'].str.replace('Royal Challengers Bangalore','Royal Challengers Bengaluru')


In [15]:
all_matches_df = all_matches_df[all_matches_df['batting_team'].isin(teams)]
all_matches_df = all_matches_df[all_matches_df['bowling_team'].isin(teams)]

In [16]:
all_matches_df.shape

(238432, 10)

In [17]:
all_matches_df['batting_team'].unique()

array(['Royal Challengers Bengaluru', 'Delhi Capitals', 'Mumbai Indians',
       'Punjab Kings', 'Kolkata Knight Riders', 'Sunrisers Hyderabad',
       'Chennai Super Kings', 'Rajasthan Royals'], dtype=object)

In [18]:
all_matches_df['result'].unique()

array([0, 1], dtype=int64)

In [19]:
final_df = all_matches_df[['batting_team','bowling_team','runs_left','balls_left','wickets_left','target','crr','rrr','result']]

In [20]:
final_df.head()

Unnamed: 0,batting_team,bowling_team,runs_left,balls_left,wickets_left,target,crr,rrr,result
0,Royal Challengers Bengaluru,Sunrisers Hyderabad,207,119,10,208,6.0,10.436975,0
1,Royal Challengers Bengaluru,Sunrisers Hyderabad,207,118,10,208,3.0,10.525424,0
2,Royal Challengers Bengaluru,Sunrisers Hyderabad,207,117,10,208,2.0,10.615385,0
3,Royal Challengers Bengaluru,Sunrisers Hyderabad,205,116,10,208,4.5,10.603448,0
4,Royal Challengers Bengaluru,Sunrisers Hyderabad,201,115,10,208,8.4,10.486957,0


In [21]:
final_df.shape

(238432, 9)

In [22]:
final_df = final_df.sample(final_df.shape[0])

In [23]:
final_df.sample()

Unnamed: 0,batting_team,bowling_team,runs_left,balls_left,wickets_left,target,crr,rrr,result
219652,Royal Challengers Bengaluru,Sunrisers Hyderabad,141,76,10,209,9.272727,11.131579,0


In [24]:
final_df.dropna(inplace=True)

In [25]:
final_df.shape

(238432, 9)

In [26]:
final_df = final_df[final_df['balls_left'] != 0]

In [27]:
final_df.shape

(236839, 9)

In [28]:
X = final_df.iloc[:,:-1]
y = final_df.iloc[:,-1]
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)

In [29]:
X_train

Unnamed: 0,batting_team,bowling_team,runs_left,balls_left,wickets_left,target,crr,rrr
137498,Royal Challengers Bengaluru,Sunrisers Hyderabad,45,18,6,209,9.647059,15.000000
241524,Royal Challengers Bengaluru,Sunrisers Hyderabad,57,28,7,209,9.913043,12.214286
32848,Mumbai Indians,Chennai Super Kings,73,45,6,157,6.720000,9.733333
105503,Chennai Super Kings,Mumbai Indians,145,108,7,149,2.000000,8.055556
59928,Kolkata Knight Riders,Sunrisers Hyderabad,63,90,9,114,10.200000,4.200000
...,...,...,...,...,...,...,...,...
29852,Sunrisers Hyderabad,Delhi Capitals,52,29,6,160,7.120879,10.758621
202777,Royal Challengers Bengaluru,Sunrisers Hyderabad,154,87,10,209,10.000000,10.620690
225576,Royal Challengers Bengaluru,Sunrisers Hyderabad,190,104,10,209,7.125000,10.961538
20661,Punjab Kings,Delhi Capitals,74,35,5,158,5.929412,12.685714


In [34]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

trf = ColumnTransformer([
    ('trf',OneHotEncoder(sparse=False,drop='first'),['batting_team','bowling_team'])
]
,remainder='passthrough')

In [35]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

In [36]:
pipe = Pipeline(steps=[
    ('step1',trf),
    ('step2',LogisticRegression(solver='liblinear'))
])

In [37]:
pipe.fit(X_train,y_train)



In [38]:
y_pred = pipe.predict(X_test)

In [39]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.9003968924168215

In [73]:
pipe.predict_proba(X_test)[11200]

array([0.0602164, 0.9397836])

In [74]:
teams

['Sunrisers Hyderabad',
 'Mumbai Indians',
 'Royal Challengers Bengaluru',
 'Kolkata Knight Riders',
 'Punjab Kings',
 'Chennai Super Kings',
 'Rajasthan Royals',
 'Delhi Capitals']

In [75]:
import pickle
pickle.dump(pipe,open('pipe.pkl','wb'))

In [78]:
def find_closest_scenarios(final_df, input_data, num_scenarios=5):
    # Extract input features
    runs_left, balls_left, wickets_left, crr, rrr = input_data

    # Calculate distance from each row in DataFrame to the input data
    final_df['distance'] = np.sqrt(
        (final_df['runs_left'] - runs_left)**2 +
        (final_df['balls_left'] - balls_left)**2 +
        (final_df['wickets_left'] - wickets_left)**2 +
        (final_df['crr'] - crr)**2 +
        (final_df['rrr'] - rrr)**2
    )

    # Sort by distance and select the closest scenarios
    closest_scenarios_df = all_matches_df.nsmallest(num_scenarios, 'distance')
    
    # Convert the results to a list of dictionaries
    closest_scenarios = closest_scenarios_df.to_dict(orient='records')
    
    # If fewer than num_scenarios are found, repeat entries or add placeholders
    while len(closest_scenarios) < num_scenarios:
        closest_scenarios.append({
            'batting_team': 'N/A',
            'bowling_team': 'N/A',
            'runs_left': None,
            'balls_left': None,
            'wickets_left': None,
            'total_runs': None,
            'target': None,
            'crr': None,
            'rrr': None,
            'result': None,
            'distance': None
        })

    return closest_scenarios


In [96]:
import numpy as np

def find_closest_scenarios(final_df, input_data, num_scenarios=5):
    # Extract input features
    runs_left, balls_left, wickets_left, crr, rrr = input_data

    # Calculate distance from each row in DataFrame to the input data
    final_df['distance'] = np.sqrt(
        (final_df['runs_left'] - runs_left)**2 +
        (final_df['balls_left'] - balls_left)**2 +
        (final_df['wickets_left'] - wickets_left)**2 +
        (final_df['crr'] - crr)**2 +
        (final_df['rrr'] - rrr)**2
    )

    # Sort by distance and select the closest scenarios
    closest_scenarios_df = final_df.nsmallest(num_scenarios, 'distance')

    # Format each scenario for output
    output = []
    for i, row in closest_scenarios_df.iterrows():
        scenario = (f"{row.get('batting_team', 'N/A')}, {row.get('bowling_team', 'N/A')}, "
                    f"Team Batting First: {row.get('batting_team', 'N/A')},\n"
                    f"Second Innings \n"
                    f"Run Scored: {row.get('total_runs', 'N/A')}, Wicket Fallen: {10 - row['wickets_left']}, "
                    f"Overs Completed: {20 - (row['balls_left'] // 6)}, "
                    f"Runs still required to win: {row['runs_left']},\n"
                    f"Winning Team: {row.get('batting_team', 'N/A') if row['result'] == 1 else row.get('bowling_team', 'N/A')}"
                    f"\n")
        output.append(scenario)

    # Ensure there are at least `num_scenarios` entries by adding placeholders if needed
    while len(output) < num_scenarios:
        output.append("Scenario N/A: Placeholder data")

    return "\n".join(output)

# Test input
input_data = [52, 30, 4, 7.1, 8.4]

# Call the function to find closest scenarios
formatted_output = find_closest_scenarios(all_matches_df, input_data, num_scenarios=5)

print(formatted_output)


Delhi Capitals, Mumbai Indians, Team Batting First: Delhi Capitals,
Second Innings 
Run Scored: 162, Wicket Fallen: 5, Overs Completed: 15, Runs still required to win: 52,
Winning Team: Mumbai Indians

Delhi Capitals, Chennai Super Kings, Team Batting First: Delhi Capitals,
Second Innings 
Run Scored: 150, Wicket Fallen: 5, Overs Completed: 15, Runs still required to win: 52,
Winning Team: Chennai Super Kings

Punjab Kings, Sunrisers Hyderabad, Team Batting First: Punjab Kings,
Second Innings 
Run Scored: 168, Wicket Fallen: 5, Overs Completed: 15, Runs still required to win: 52,
Winning Team: Punjab Kings

Rajasthan Royals, Delhi Capitals, Team Batting First: Rajasthan Royals,
Second Innings 
Run Scored: 143, Wicket Fallen: 5, Overs Completed: 15, Runs still required to win: 52,
Winning Team: Rajasthan Royals

Punjab Kings, Royal Challengers Bengaluru, Team Batting First: Punjab Kings,
Second Innings 
Run Scored: 174, Wicket Fallen: 7, Overs Completed: 15, Runs still required to win: 

In [97]:
import pandas as pd

# Assuming all_matches_df is your DataFrame
all_matches_df.to_csv('all_matches.csv', index=False)
