In [70]:
import pandas as pd

# Load the dataset
df = pd.read_csv('train_data.csv')  # Replace 'your_dataset.csv' with your actual dataset file

df.columns = ['id']

df[['id', 'Name', 'Genre','Description']] = df['id'].str.split(':::', expand=True)

# Explore the dataset
print(df.head())
print(df.info())
print(df['Genre'].value_counts())

   id                                Name          Genre  \
0  2                        Cupid (1997)       thriller    
1  3    Young, Wild and Wonderful (1980)          adult    
2  4               The Secret Sin (1915)          drama    
3  5              The Unrecovered (2007)          drama    
4  6              Quality Control (2011)    documentary    

                                         Description  
0   A brother and sister with a past incestuous r...  
1   As the bus empties the students for their fie...  
2   To help their unemployed father make ends mee...  
3   The film's title refers not only to the un-re...  
4   Quality Control consists of a series of 16mm ...  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12771 entries, 0 to 12770
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           12771 non-null  object
 1   Name         12771 non-null  object
 2   Genre        12771 non-null  obj

In [71]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# Download NLTK resources
import nltk
nltk.download('stopwords')
nltk.download('punkt')

# Text preprocessing function
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text)
    filtered_text = [word for word in tokens if word.lower() not in stop_words]
    return ' '.join(filtered_text)

# Apply text preprocessing to the 'Description' column
df['Processed_Description'] = df['Description'].apply(preprocess_text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [72]:
df['Processed_Description']

0        brother sister past incestuous relationship cu...
1        bus empties students field trip museum natural...
2        help unemployed father make ends meet edith tw...
3        films title refers unrecovered bodies ground z...
4        quality control consists series 16mm single ta...
                               ...                        
12766    easy breathing tragicomic story seija form sho...
12767    enter world pandemonium reigns reckless ambiti...
12768    shepherd nomad tribe daughter chieftain fall l...
12769    documentary film soviet gulag 89 year old esto...
12770    impact dianas death focus shifted charles wido...
Name: Processed_Description, Length: 12771, dtype: object

In [73]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # Adjust max_features as needed
tfidf_matrix = tfidf_vectorizer.fit_transform(df['Processed_Description'])

# Convert TF-IDF matrix to a DataFrame
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

In [74]:
from sklearn.preprocessing import LabelEncoder

# Use label encoding for the 'Genre' column
label_encoder = LabelEncoder()
df['Genre_LabelEncoded'] = label_encoder.fit_transform(df['Genre'])

In [75]:
df['Genre_LabelEncoded']

0        24
1         1
2         8
3         8
4         7
         ..
12766    21
12767     7
12768     8
12769     7
12770     7
Name: Genre_LabelEncoded, Length: 12771, dtype: int64

In [76]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(tfidf_df, df['Genre_LabelEncoded'], test_size=0.2, random_state=42)

In [77]:
X_train.head()

Unnamed: 0,10,20,abandoned,able,accident,across,act,action,actor,actors,...,writer,written,wrong,year,years,yet,york,young,younger,youth
1803,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.181705,0.0,0.0
3688,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.113219,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8279,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1642,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142613,0.0,0.0
6399,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [78]:
y_train.head()

1803    20
3688    24
8279    11
1642     8
6399     1
Name: Genre_LabelEncoded, dtype: int64

In [79]:
from sklearn.ensemble import RandomForestClassifier

# Initialize and train the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [80]:
from sklearn.metrics import accuracy_score, classification_report

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Generate a classification report
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

Accuracy: 0.45
               precision    recall  f1-score   support

      action        0.00      0.00      0.00        64
       adult        0.00      0.00      0.00        34
   adventure        0.00      0.00      0.00        43
   animation        0.00      0.00      0.00        24
   biography        0.00      0.00      0.00         8
      comedy        0.37      0.26      0.31       370
       crime        0.00      0.00      0.00        19
 documentary        0.54      0.82      0.65       613
       drama        0.39      0.78      0.52       615
      family        0.00      0.00      0.00        31
     fantasy        0.00      0.00      0.00        23
   game-show        1.00      0.27      0.43        11
     history        0.00      0.00      0.00        10
      horror        0.47      0.08      0.14        98
       music        0.80      0.10      0.17        42
     musical        0.00      0.00      0.00        11
     mystery        0.00      0.00      0.00     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [81]:
df2 = pd.read_csv('test_data.csv')

In [82]:
df2.columns = ['id']

df2[['id', 'Name','Description']] = df2['id'].str.split(':::', expand=True)

# Explore the dataset
print(df2.head())
print(df2.info())

   id                           Name  \
0  2       La guerra de papá (1977)    
1  3    Off the Beaten Track (2010)    
2  4         Meu Amigo Hindu (2015)    
3  5              Er nu zhai (1955)    
4  6             Riddle Room (2016)    

                                         Description  
0   Spain, March 1964: Quico is a very naughty ch...  
1   One year in the life of Albin and his family ...  
2   His father has died, he hasn't spoken with hi...  
3   Before he was known internationally as a mart...  
4   Emily Burns is being held captive in a room w...  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8210 entries, 0 to 8209
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           8210 non-null   object
 1   Name         8210 non-null   object
 2   Description  8210 non-null   object
dtypes: object(3)
memory usage: 192.5+ KB
None


In [83]:
# Apply text preprocessing to the 'Description' column
df2['Processed_Description'] = df2['Description'].apply(preprocess_text)

In [84]:
# Assuming 'new_data' is a DataFrame with a 'Processed_Description' column
new_tfidf_matrix = tfidf_vectorizer.transform(df2['Processed_Description'])

# Make predictions on new data
new_predictions = model.predict(new_tfidf_matrix)

# Convert numerical predictions back to genre labels
predicted_genres = label_encoder.inverse_transform(new_predictions)
df2['Predicted_Genre'] = predicted_genres



In [85]:
df2['Predicted_Genre'].head()

0           drama 
1     documentary 
2           drama 
3           drama 
4           drama 
Name: Predicted_Genre, dtype: object