In [1]:
import re
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

def extract_info(commentary):
    bowler_pattern = r'([A-Za-z]+\s[A-Za-z]+) to'
    batter_pattern = r'to ([A-Za-z]+\s[A-Za-z]+)'
    ball_type_pattern = r'\b(yorker|bouncer|full toss|good length|short ball)\b'
    shot_type_pattern = r'\b(boundary|six|four|single|double|triple)\b'
    speed_pattern = r'\b(\d{2,3})\s?kph\b'
    runs_pattern = r'\b(\d+) runs?\b'
    
    bowler = re.search(bowler_pattern, commentary)
    batter = re.search(batter_pattern, commentary)
    ball_type = re.search(ball_type_pattern, commentary, re.IGNORECASE)
    shot_type = re.search(shot_type_pattern, commentary, re.IGNORECASE)
    speed = re.search(speed_pattern, commentary)
    runs = re.search(runs_pattern, commentary)
    
    return {
        "Bowler": bowler.group(1) if bowler else None,
        "Batter": batter.group(1) if batter else None,
        "Ball Type": ball_type.group(1) if ball_type else None,
        "Shot Type": shot_type.group(1) if shot_type else None,
        "Speed (kph)": speed.group(1) if speed else None,
        "Runs Scored": runs.group(1) if runs else None
    }

# Example commentary data
commentary_data = [
    "Jasprit Bumrah to Virat Kohli, good length ball, played for a single.",
    "Mitchell Starc to Rohit Sharma, yorker at 145 kph, driven for four runs!",
    "Pat Cummins to KL Rahul, short ball, pulled for a six!",
    "Rashid Khan to MS Dhoni, full toss, flicked for a double."
]

# Extract information for each commentary row
data = [extract_info(comment) for comment in commentary_data]

# Create DataFrame
df = pd.DataFrame(data)
print(df)

# Implementing TF-IDF from scratch
def compute_tf(doc):
    tf = Counter(doc.split())
    total_terms = len(doc.split())
    return {word: count / total_terms for word, count in tf.items()}

def compute_idf(docs):
    N = len(docs)
    idf = {}
    all_words = set(word for doc in docs for word in doc.split())
    for word in all_words:
        containing_docs = sum(1 for doc in docs if word in doc.split())
        idf[word] = np.log((N + 1) / (containing_docs + 1)) + 1
    return idf

def compute_tfidf(docs):
    idf = compute_idf(docs)
    tfidf = []
    for doc in docs:
        tf = compute_tf(doc)
        tfidf.append({word: tf[word] * idf[word] for word in tf})
    return tfidf

# Compute TF-IDF manually
tfidf_manual = compute_tfidf(commentary_data)
print("\nTF-IDF from scratch:")
for i, doc_tfidf in enumerate(tfidf_manual):
    print(f"Document {i+1}: {doc_tfidf}")

# Compare with Scikit-learn's TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(commentary_data)
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

print("\nTF-IDF using Scikit-learn:")
print(tfidf_df)

           Bowler        Batter    Ball Type Shot Type Speed (kph) Runs Scored
0  Jasprit Bumrah   Virat Kohli  good length    single        None        None
1  Mitchell Starc  Rohit Sharma       yorker      four         145        None
2     Pat Cummins      KL Rahul   short ball       six        None        None
3     Rashid Khan      MS Dhoni    full toss    double        None        None

TF-IDF from scratch:
Document 1: {'Jasprit': np.float64(0.15969089432284625), 'Bumrah': np.float64(0.15969089432284625), 'to': np.float64(0.08333333333333333), 'Virat': np.float64(0.15969089432284625), 'Kohli,': np.float64(0.15969089432284625), 'good': np.float64(0.15969089432284625), 'length': np.float64(0.15969089432284625), 'ball,': np.float64(0.12590213531383254), 'played': np.float64(0.15969089432284625), 'for': np.float64(0.08333333333333333), 'a': np.float64(0.10192862927618414), 'single.': np.float64(0.15969089432284625)}
Document 2: {'Mitchell': np.float64(0.14740697937493502), 'Starc': n