In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
# Apply seaborn's "whitegrid" style to the figures drawn in this notebook.
sns.set_style(style="whitegrid")


In [None]:

# Load the matches dataset from disk and preview its first rows.
matches_csv = '../../static/data/dataset_2//matches.csv'
data = pd.read_csv(matches_csv)
data.head()

In [None]:
# Descriptive statistics of the loaded frame.
summary_stats = data.describe()
summary_stats

In [None]:
# Count the missing values in every column of the dataset.
null_counts = data.isna().sum()
null_counts

In [None]:
# umpire3 is null in almost every row, so the column is dropped.
# It is removed by name rather than by position ("everything but the last
# column"): re-running this cell is then a no-op instead of silently
# discarding one more column each time.
data = data.drop(columns=["umpire3"], errors="ignore")

In [None]:
# Drop the remaining rows that still contain at least one null value.
# The result is rebound to `data` instead of using inplace=True, so the cell
# produces a fresh frame rather than mutating the existing one.
data = data.dropna()

In [None]:
# List the distinct team names that appear in the team1 column.
pd.unique(data["team1"])

In [None]:
# Some franchises appear under an old name; map each old name to the current one.
team_renames = {
    "Delhi Daredevils": "Delhi Capitals",
    "Deccan Chargers": "Sunrisers Hyderabad",
}
# toss_winner holds team names as well, so it is renamed together with
# team1 / team2 / winner; otherwise the same franchise would show up under
# two different names once the columns are encoded.
team_columns = ["team1", "team2", "toss_winner", "winner"]
# Series.replace with a dict swaps exact values and leaves every other entry as is.
data = data.assign(
    **{column: data[column].replace(team_renames) for column in team_columns}
)

In [None]:
# Re-check the distinct team1 names now that the old names were replaced.
data["team1"].drop_duplicates().to_numpy()

In [None]:
# Horizontal bar chart of matches won per team, ordered by number of wins.
fig, ax = plt.subplots(figsize=(10, 6))
win_order = data['winner'].value_counts().index
sns.countplot(y='winner', data=data, order=win_order, ax=ax)
ax.set_xlabel('Wins')
ax.set_ylabel('Team')
ax.set_title('Number of  IPL  matches won by each team')

In [None]:
# Horizontal bar chart of the ten venues with the most matches.
fig, ax = plt.subplots(figsize=(10, 6))
top_venues = data['venue'].value_counts().iloc[:10].index
sns.countplot(y='venue', data=data, order=top_venues, ax=ax)
ax.set_xlabel('No of matches', fontsize=12)
ax.set_ylabel('Venue', fontsize=12)
ax.set_title('Total Number of matches played in different stadium')


In [None]:
# Bar chart of how often each toss decision was taken by the toss-winning team.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x="toss_decision", data=data, ax=ax)
ax.set_xlabel('Toss Decision', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
ax.set_title('Toss Decision')

In [None]:
# Print the distinct values and their frequencies for a few selected columns.
columns_to_inspect = ("city", "toss_decision", "result", "dl_applied")
for column in columns_to_inspect:
    series = data[column]
    print("------------")
    print(series.unique())
    print(series.value_counts())

In [None]:
# Not every column is used as a model input: identifiers, dates, officials and
# location details are removed before building the feature matrix.
unused_columns = [
    "id", "Season", "city", "date",
    "player_of_match", "umpire1", "umpire2", "venue",
]
# Rebinding with errors="ignore" (instead of an inplace drop) keeps the cell
# re-runnable: a second execution no longer fails with a KeyError because the
# columns are already gone.
data = data.drop(columns=unused_columns, errors="ignore")

In [None]:
# Preview the columns that remain after the drop.
data.head(n=5)

In [None]:
# Split the frame into the model inputs (X) and the prediction target (y).
#
# Victory-margin columns are only known once a match has finished and encode
# which side won, so leaving them in X leaks the target into the features.
# NOTE(review): assumes the CSV carries `win_by_runs` / `win_by_wickets`
# (not visible in this notebook) — confirm against the file. errors="ignore"
# makes the drop a no-op for any of these columns that is absent.
# NOTE(review): `result` also looks like a post-match field; confirm whether
# it should stay among the inputs.
post_match_columns = ["win_by_runs", "win_by_wickets"]
X = data.drop(columns=["winner", *post_match_columns], errors="ignore")
y = data["winner"]

In [None]:
# One-hot encode the categorical input columns.
# The list is passed as `columns=`: the second positional parameter of
# pd.get_dummies is `prefix`, so the positional form only works while the
# number of names happens to equal the number of object-dtype columns.
categorical_columns = ["team1", "team2", "toss_winner", "toss_decision", "result"]
X = pd.get_dummies(X, columns=categorical_columns, drop_first=True)
X.head(5)

In [None]:
# The target is categorical (team names), so encode it as integer class labels.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
# Fit on data["winner"] rather than on `y` itself: re-running the cell then
# yields the same labels, whereas re-encoding an already encoded `y` would
# replace the team names kept in le.classes_ with integers.
y = le.fit_transform(data["winner"])

In [None]:
# Split the data into a training set (80%) and a test set (20%).
from sklearn.model_selection import train_test_split

# A fixed random_state makes the shuffle, and therefore the reported accuracy,
# reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=42
)

In [None]:
# Model creation: a random forest with 200 trees.
from sklearn.ensemble import RandomForestClassifier

# random_state pins the bootstrap sampling and feature selection so repeated
# runs train the same forest.
model = RandomForestClassifier(
    n_estimators=200,
    min_samples_split=3,
    random_state=42,
)

In [None]:
# Train the classifier on the training split.
model.fit(X=x_train, y=y_train)

In [None]:
# Predict the class labels for the held-out test inputs and keep them as y_pred.
y_pred = model.predict(X=x_test)

In [None]:
# Evaluate the predictions: fraction of test rows whose label was predicted correctly.
from sklearn.metrics import accuracy_score

# accuracy_score expects (y_true, y_pred). Accuracy itself is symmetric, but
# naming the arguments keeps the call correct if it is later swapped for a
# metric where the order matters.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy