In [1]:
from  datasets import load_dataset
import pandas as pd

dataset = load_dataset("stanfordnlp/imdb")
train_df = pd.DataFrame(dataset["train"])
test_df = pd.DataFrame(dataset["test"])


X_train = train_df["text"]
y_train = train_df["label"]

X_test = test_df["text"]
y_test = test_df["label"]
train_df.head()



  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,text,label
0,I rented I AM CURIOUS-YELLOW from my video sto...,0
1,"""I Am Curious: Yellow"" is a risible and preten...",0
2,If only to avoid making this type of film in t...,0
3,This film was probably inspired by Godard's Ma...,0
4,"Oh, brother...after hearing about this ridicul...",0


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

vectorizer = TfidfVectorizer(
    max_features=20000,
    ngram_range=(1,2),
    stop_words="english"
)

X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

logreg = LogisticRegression(max_iter=500)
logreg.fit(X_train_tfidf, y_train)

logreg_pred = logreg.predict(X_test_tfidf)

print("Logistic Regression Accuracy:", accuracy_score(y_test, logreg_pred))
print(classification_report(y_test, logreg_pred))



Logistic Regression Accuracy: 0.88264
              precision    recall  f1-score   support

           0       0.88      0.88      0.88     12500
           1       0.88      0.88      0.88     12500

    accuracy                           0.88     25000
   macro avg       0.88      0.88      0.88     25000
weighted avg       0.88      0.88      0.88     25000



In [3]:
from sklearn.svm import LinearSVC

svm = LinearSVC()
svm.fit(X_train_tfidf, y_train)

svm_pred = svm.predict(X_test_tfidf)

print("SVM Accuracy:", accuracy_score(y_test, svm_pred))
print(classification_report(y_test, svm_pred))


SVM Accuracy: 0.8702
              precision    recall  f1-score   support

           0       0.86      0.88      0.87     12500
           1       0.88      0.86      0.87     12500

    accuracy                           0.87     25000
   macro avg       0.87      0.87      0.87     25000
weighted avg       0.87      0.87      0.87     25000



In [4]:
from transformers import DistilBertTokenizerFast

tokenizer = DistilBertTokenizerFast.from_pretrained(
    "distilbert-base-uncased"
)
def tokenize(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=256
    )

dataset_tokenized = dataset.map(
    tokenize,
    batched=True,
    batch_size=512
)

dataset_tokenized = dataset_tokenized.remove_columns(["text"])
dataset_tokenized = dataset_tokenized.rename_column("label", "labels")
dataset_tokenized.set_format("torch")



In [7]:
from transformers import DistilBertForSequenceClassification,AutoTokenizer,HFAutoTrainer,HFFineTuningConfig


# Model & tokenizer
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2
)
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# CONFIG equivalent to TrainingArguments
config = HFFineTuningConfig(
    output_dir="./distilbert_imdb",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_steps=100
)

# Trainer
trainer = HFAutoTrainer(
    model=model,
    config=config,
    train_dataset=dataset_tokenized["train"],
    eval_dataset=dataset_tokenized["test"],
    tokenizer=tokenizer
)

# Train
trainer.train()



ImportError: cannot import name 'HFAutoTrainer' from 'transformers' (C:\Users\Administrator\PycharmProjects\Nlp-final-project\.venv\Lib\site-packages\transformers\__init__.py)

In [None]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./distilbert_imdb",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_steps=100
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_tokenized["train"],
    eval_dataset=dataset_tokenized["test"]
)

trainer.train()


In [None]:
predictions = trainer.predict(dataset_tokenized["test"])
distilbert_preds = predictions.predictions.argmax(-1)

from sklearn.metrics import accuracy_score, classification_report

print("DistilBERT Accuracy:", accuracy_score(y_test, distilbert_preds))
print(classification_report(y_test, distilbert_preds))


In [None]:
import requests
from dotenv import load_dotenv
import os


load_dotenv()  # Load variables from .env file
api_key = os.getenv("API_KEY")

movie_title = "Top Gun: Maverick"

# Search for movie ID
search_url = f"https://api.themoviedb.org/3/search/movie?api_key={api_key}&query={movie_title}"
response = requests.get(search_url).json()
movie_id = response["results"][0]["id"]

# Get cast and crew
credits_url = f"https://api.themoviedb.org/3/movie/{movie_id}/credits?api_key={api_key}"
credits = requests.get(credits_url).json()

# Extract names
actors = [member["name"] for member in credits["cast"][:5]]
directors = [member["name"] for member in credits["crew"] if member["job"] == "Director"]

print("Actors:", actors)
print("Director:", directors)


In [None]:
import spacy

nlp = spacy.load("en_core_web_sm")
sample_review = train_data[0]["text"]
doc = nlp(sample_review)

# Extract named entities
entities = [(ent.text, ent.label_) for ent in doc.ents if ent.label_ in ["PERSON"]]
print("Named Entities:", entities)

In [None]:
from transformers import DistilBertTokenizerFast

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

train_encodings = tokenizer(
    X_train.tolist(),
    truncation=True,
    padding=True,
    max_length=256
)

test_encodings = tokenizer(
    X_test.tolist(),
    truncation=True,
    padding=True,
    max_length=256
)