In [16]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder

# Download NLTK data
nltk.download('stopwords')

# Load the dataset
df = pd.read_csv('movies.csv')

# Inspect the dataset
print(df.head())

# Preprocess the text data
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation and numbers
    text = re.sub(r'[^a-z\s]', '', text)
    # Tokenize
    words = text.split()
    # Remove stopwords
    words = [word for word in words if word not in stopwords.words('english')]
    return ' '.join(words)

df['processed_description'] = df['movie_title'].apply(preprocess_text)

# Encode the genres as labels
label_encoder = LabelEncoder()
df['genre_encoded'] = label_encoder.fit_transform(df['genre'])

# Inspect the processed data
print(df.head())


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\adity\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                       movie_title release_date      genre mpaa_rating  \
0  Snow White and the Seven Dwarfs   1937-12-21    Musical           G   
1                        Pinocchio   1940-02-09  Adventure           G   
2                         Fantasia   1940-11-13    Musical           G   
3                Song of the South   1946-11-12  Adventure           G   
4                       Cinderella   1950-02-15      Drama           G   

   total_gross  inflation_adjusted_gross  
0    184925485                5228953251  
1     84300000                2188229052  
2     83320000                2187090808  
3     65000000                1078510579  
4     85000000                 920608730  
                       movie_title release_date      genre mpaa_rating  \
0  Snow White and the Seven Dwarfs   1937-12-21    Musical           G   
1                        Pinocchio   1940-02-09  Adventure           G   
2                         Fantasia   1940-11-13    Musical           G   
3

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# Fit and transform the processed descriptions
X = tfidf_vectorizer.fit_transform(df['processed_description']).toarray()

# Extract the labels
y = df['genre_encoded']


In [21]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the classifier
classifier = RandomForestClassifier(n_estimators=100, random_state=42)
classifier.fit(X_train, y_train)

# Predict on the test set
y_pred = classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Extract the unique labels in y_test
unique_labels = sorted(set(y_test))

# Generate the classification report
target_names = [label_encoder.classes_[label] for label in unique_labels]
print(classification_report(y_test, y_pred, labels=unique_labels, target_names=target_names, zero_division=0))


Accuracy: 0.33


TypeError: object of type 'float' has no len()

In [20]:
def predict_genre(description):
    processed_description = preprocess_text(description)
    description_tfidf = tfidf_vectorizer.transform([processed_description]).toarray()
    predicted_genre_encoded = classifier.predict(description_tfidf)
    predicted_genre = label_encoder.inverse_transform(predicted_genre_encoded)
    return predicted_genre[0]

# Example usage
new_movie_description = "An adventurous young man sets out on a journey to find a hidden treasure."
predicted_genre = predict_genre(new_movie_description)
print(f'Predicted Genre: {predicted_genre}')


Predicted Genre: Adventure
