# Text Mining & Sentiment Analysis Project

This notebook demonstrates text mining and sentiment analysis, including:
- Text data preprocessing
- Feature extraction
- Sentiment analysis
- Visualization of results

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download required NLTK data.
# Recent NLTK releases load the tokenizer model from 'punkt_tab' rather than
# 'punkt'; both are requested so word_tokenize works on old and new versions
# (an id unknown to the installed NLTK is reported, not raised).
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# Set style for visualizations.
# Matplotlib 3.6 renamed the bundled 'seaborn' style to 'seaborn-v0_8' and the
# old name was later removed, so prefer the new name whenever it is available.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
sns.set_palette('Set2')

In [None]:
def generate_sample_data(n_samples=1000, random_state=None):
    """Build a synthetic labelled review dataset.

    Each row is one phrase drawn at random from a small fixed pool of
    positive or negative review snippets, with the matching label.

    Args:
        n_samples: Number of rows to generate.
        random_state: Optional integer seed. When given, a private
            ``np.random.RandomState`` is used so the result is reproducible
            and the global NumPy random state is left untouched. When
            ``None`` (default), draws come from the global ``np.random``
            state, exactly as before this parameter existed.

    Returns:
        A ``pandas.DataFrame`` with columns ``text`` (the phrase) and
        ``sentiment`` (1 for positive, 0 for negative).
    """
    positive_phrases = [
        "Great product, highly recommend!",
        "Excellent service and quality",
        "Very satisfied with my purchase",
        "Amazing experience overall",
    ]

    negative_phrases = [
        "Poor quality product",
        "Disappointed with the service",
        "Would not recommend",
        "Terrible experience",
    ]

    # The np.random module and RandomState expose the same random_sample/choice
    # API, so the sampling loop below is identical for both sources.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    texts = []
    sentiments = []

    # One coin flip then one phrase draw per row; this order is kept so callers
    # who seed the global state with np.random.seed still get the same data.
    for _ in range(n_samples):
        if rng.random_sample() > 0.5:
            texts.append(rng.choice(positive_phrases))
            sentiments.append(1)
        else:
            texts.append(rng.choice(negative_phrases))
            sentiments.append(0)

    return pd.DataFrame({
        'text': texts,
        'sentiment': sentiments,
    })

# Generate sample data
# Called without arguments, so the function's default n_samples (1000) is used.
df = generate_sample_data()
# Bare trailing expression returning the first rows of the frame; presumably
# meant to be rendered as the notebook cell's output.
df.head()

In [None]:
# Shared lemmatizer / stop-word set, filled on first use by _text_resources().
_TEXT_RESOURCES = {}


def _text_resources():
    """Return the shared (lemmatizer, stop-word set) pair, building it once."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
        _TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
    return _TEXT_RESOURCES['lemmatizer'], _TEXT_RESOURCES['stop_words']


def preprocess_text(text):
    """Normalise a raw text string for vectorisation.

    The text is lower-cased and tokenised with ``word_tokenize``; tokens that
    are not purely alphabetic or that are English stop words are dropped, and
    the remaining tokens are lemmatised.

    Args:
        text: The raw input string.

    Returns:
        The surviving lemmatised tokens joined by single spaces (an empty
        string when no token survives).
    """
    # The lemmatizer and stop-word set do not depend on the input, so they are
    # built once and reused instead of being recreated for every row.
    lemmatizer, stop_words = _text_resources()
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text.lower())
        if token.isalpha() and token not in stop_words
    )

# Run every raw text through preprocess_text and store the cleaned version
# in a new column next to the original
df['processed_text'] = df['text'].map(preprocess_text)

In [None]:
# Feature extraction and model training
y = df['sentiment']

# Split the cleaned texts BEFORE vectorising. Fitting TF-IDF on the whole
# corpus would let test-set vocabulary and IDF weights leak into the training
# features. The same test_size/random_state keeps the row partition unchanged.
text_train, text_test, y_train, y_test = train_test_split(
    df['processed_text'], y, test_size=0.2, random_state=42
)

# Learn vocabulary and IDF weights from the training texts only, then reuse
# the fitted vectorizer for the held-out texts.
tfidf = TfidfVectorizer(max_features=1000)
X_train = tfidf.fit_transform(text_train)
X_test = tfidf.transform(text_test)

# Full feature matrix under the train-fitted vectorizer; kept so the name X
# remains available to any code that expects it.
X = tfidf.transform(df['processed_text'])

model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)

# Evaluate on the held-out split
y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))

In [None]:
# Visualizations

# Figure 1: annotated heatmap of the confusion matrix for the test predictions
cm = confusion_matrix(y_test, y_pred)
cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=cm_ax)
cm_ax.set_title('Confusion Matrix')
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('Actual')
plt.show()

# Figure 2: bar chart of how many rows carry each sentiment label
dist_fig, dist_ax = plt.subplots(figsize=(8, 6))
df['sentiment'].value_counts().plot.bar(ax=dist_ax)
dist_ax.set_title('Sentiment Distribution')
dist_ax.set_xlabel('Sentiment (0: Negative, 1: Positive)')
dist_ax.set_ylabel('Count')
plt.show()