<a href="https://colab.research.google.com/github/auro005/ML/blob/main/hangman_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
infile = open('words_250000_train.txt','r')
corpus = []
for line in infile:
  #print(line,len(line[:-1]))

  corpus.append(line[:-1])
infile.close()

In [10]:
len(corpus)

227300

In [52]:
short_corpus = corpus[:10000]

In [53]:
import random

def generate_word_patterns(word):
    """Generate different masked versions of a word for training."""
    patterns = []
    word_length = len(word)

    # Create 3 random masking patterns for training data
    for _ in range(word_length):
        mask = list(word)
        mask_indices = random.sample(range(word_length), k=random.randint(1, word_length - 1))
        for idx in mask_indices:
            mask[idx] = '_'
        patterns.append(" ".join(mask))  # Add spaces between letters
    return patterns

# Generate patterns for the corpus
training_patterns = []
for word in short_corpus:
    patterns = generate_word_patterns(word)
    for pattern in patterns:
        training_patterns.append((pattern, word))  # (masked pattern, full word)


In [54]:
training_patterns[:5]

[('a a _', 'aaa'),
 ('a _ a', 'aaa'),
 ('_ a a', 'aaa'),
 ('_ a _ _ _ _', 'aaaaaa'),
 ('a a a a a _', 'aaaaaa')]

In [55]:
from collections import Counter

def extract_features(pattern, corpus):
    """Extract features from the masked pattern."""
    features = {}

    # Visible letters and their positions
    letters = [char for char in pattern if char.isalpha()]
    features['visible_letters'] = "".join(letters)
    features['word_length'] = len(pattern.replace(" ", ""))

    # Frequency of visible letters in the corpus
    ## create a single string that concat all the words then find the frequency of each letter
    all_letters = "".join(corpus)
    letter_freq = Counter(all_letters)
    features['letter_freq_sum'] = sum(letter_freq[letter] for letter in letters if letter in letter_freq)

    return features

# Example usage
features = [extract_features(pattern, short_corpus) for pattern, word in training_patterns]


In [67]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

def extract_features(pattern, corpus):
    """Extract numeric features from the masked pattern."""
    features = {}

    # Visible letters
    letters = [char for char in pattern if char.isalpha()]
    features['visible_letters'] = "".join(letters)
    features['word_length'] = len(pattern.replace(" ", ""))

    # Frequency of visible letters in the corpus
    all_letters = "".join(corpus)
    letter_freq = Counter(all_letters)
    features['letter_freq_sum'] = sum(letter_freq[letter] for letter in letters if letter in letter_freq)

    return features


from sklearn.feature_extraction.text import CountVectorizer

# Extract `visible_letters` as text
visible_letters_list = [features['visible_letters'] for features, label in final_data]

# One-hot encode visible letters using CountVectorizer
vectorizer = CountVectorizer(analyzer='char')  # Treat each character as a feature
letter_features = vectorizer.fit_transform(visible_letters_list).toarray()

# Convert remaining features to numeric
numeric_features = [
    [features['word_length'], features['letter_freq_sum']]
    for features, label in final_data
]

# Combine one-hot encoded features with numeric features
X = np.hstack([letter_features, numeric_features])

# Labels remain the same
y = [label for features, label in final_data]


In [68]:
X

array([[    2,     0,     0, ...,     0,     3, 33444],
       [    2,     0,     0, ...,     0,     3, 33444],
       [    2,     0,     0, ...,     0,     3, 33444],
       ...,
       [    0,     0,     0, ...,     0,     7,  6532],
       [    1,     0,     0, ...,     0,     7, 42188],
       [    0,     0,     0, ...,     0,     7,  9025]])

In [69]:
from sklearn.preprocessing import LabelEncoder

# Encode the target labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [70]:
y_encoded

array([0, 0, 0, ..., 0, 2, 0])

In [71]:
from sklearn.model_selection import train_test_split

# Use `y_encoded` for splitting
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Train the model
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Evaluate the model
train_accuracy = clf.score(X_train, y_train)
test_accuracy = clf.score(X_test, y_test)

print("Training Accuracy:", train_accuracy)
print("Testing Accuracy:", test_accuracy)


Training Accuracy: 0.8992985837817579
Testing Accuracy: 0.5497670932162553


In [61]:
def generate_label(pattern, full_word):
    """Generate the label: the next letter to guess."""
    masked_word = pattern.replace(" ", "")
    for idx, char in enumerate(full_word):
        if masked_word[idx] == '_':  # Return the first missing letter
            return char
    return None  # All letters guessed


In [62]:
labeled_data = []
for pattern, word in training_patterns:
    label = generate_label(pattern, word)
    if label:  # Ignore if no missing letters
        labeled_data.append((pattern, word, label))


In [63]:
labeled_data[:3]

[('a a _', 'aaa', 'a'), ('a _ a', 'aaa', 'a'), ('_ a a', 'aaa', 'a')]

In [64]:
final_data = []
for pattern, word, label in labeled_data:
    features = extract_features(pattern, short_corpus)
    final_data.append((features, label))

In [65]:
final_data

[({'visible_letters': 'aa', 'word_length': 3, 'letter_freq_sum': 33444}, 'a'),
 ({'visible_letters': 'aa', 'word_length': 3, 'letter_freq_sum': 33444}, 'a'),
 ({'visible_letters': 'aa', 'word_length': 3, 'letter_freq_sum': 33444}, 'a'),
 ({'visible_letters': 'a', 'word_length': 6, 'letter_freq_sum': 16722}, 'a'),
 ({'visible_letters': 'aaaaa', 'word_length': 6, 'letter_freq_sum': 83610},
  'a'),
 ({'visible_letters': 'a', 'word_length': 6, 'letter_freq_sum': 16722}, 'a'),
 ({'visible_letters': 'aaaaa', 'word_length': 6, 'letter_freq_sum': 83610},
  'a'),
 ({'visible_letters': 'a', 'word_length': 6, 'letter_freq_sum': 16722}, 'a'),
 ({'visible_letters': 'aaaa', 'word_length': 6, 'letter_freq_sum': 66888},
  'a'),
 ({'visible_letters': 'a', 'word_length': 4, 'letter_freq_sum': 16722}, 'a'),
 ({'visible_letters': 'a', 'word_length': 4, 'letter_freq_sum': 16722}, 'a'),
 ({'visible_letters': 'a', 'word_length': 4, 'letter_freq_sum': 16722}, 'a'),
 ({'visible_letters': 'a', 'word_length': 4,

In [66]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Convert features to a numeric format for training
X = [list(features.values()) for features, label in final_data]
y = [label for features, label in final_data]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a classifier
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

# Evaluate model
print("Training Accuracy:", clf.score(X_train, y_train))
print("Testing Accuracy:", clf.score(X_test, y_test))


ValueError: could not convert string to float: 'goriticlly'

In [21]:
features

[{'visible_letters': 'a', 'word_length': 3, 'letter_freq_sum': 26},
 {'visible_letters': 'a', 'word_length': 3, 'letter_freq_sum': 26},
 {'visible_letters': 'aa', 'word_length': 3, 'letter_freq_sum': 52},
 {'visible_letters': 'a', 'word_length': 6, 'letter_freq_sum': 26},
 {'visible_letters': 'aaaaa', 'word_length': 6, 'letter_freq_sum': 130},
 {'visible_letters': 'aaaa', 'word_length': 6, 'letter_freq_sum': 104},
 {'visible_letters': 'aaaa', 'word_length': 6, 'letter_freq_sum': 104},
 {'visible_letters': 'a', 'word_length': 6, 'letter_freq_sum': 26},
 {'visible_letters': 'aaaa', 'word_length': 6, 'letter_freq_sum': 104},
 {'visible_letters': 'as', 'word_length': 4, 'letter_freq_sum': 29},
 {'visible_letters': 'aas', 'word_length': 4, 'letter_freq_sum': 55},
 {'visible_letters': 'aa', 'word_length': 4, 'letter_freq_sum': 52},
 {'visible_letters': 'as', 'word_length': 4, 'letter_freq_sum': 29},
 {'visible_letters': 'n', 'word_length': 6, 'letter_freq_sum': 2},
 {'visible_letters': 'e', 