In [16]:
import pandas as pd

# Load the dataset from Excel
file_path = "the final dataset.xlsx"
df = pd.read_excel(file_path, sheet_name="Sheet2")

# Display the first few rows of the dataset for inspection
print(df.head())

# Check the columns in the dataset to verify data structure
print(df.columns)


                                          Phone Name  \
0  Wingtech Visible Midnight (WTVIS01) 32GB - Gra...   
1  Wingtech Visible Midnight (WTVIS01) 32GB - Gra...   
2  Wingtech Visible Midnight (WTVIS01) 32GB - Gra...   
3  Wingtech Visible Midnight (WTVIS01) 32GB - Gra...   
4  Wingtech Visible Midnight (WTVIS01) 32GB - Gra...   

                                            Comments  
0  Really impressed with the screen quality and s...  
1  Really impressed with the screen quality and s...  
2  Battery life is excellent, and performance is ...  
3          No issues so far, feels almost brand new!  
4  Battery life is excellent, and performance is ...  
Index(['Phone Name', 'Comments'], dtype='object')


In [18]:
import nltk
nltk.download('vader_lexicon')


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\KIIT\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [20]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Initialize the VADER sentiment analyzer
sid = SentimentIntensityAnalyzer()

# Function to analyze sentiment
def analyze_sentiment(review):
    scores = sid.polarity_scores(str(review))
    if scores['compound'] >= 0.05:
        return "Positive"
    elif scores['compound'] <= -0.05:
        return "Negative"
    else:
        return "Neutral"

# Apply the sentiment analysis function
df['Sentiment'] = df['Comments'].apply(analyze_sentiment)

# Display a sample of classified data
print(df[['Comments', 'Sentiment']].head())


                                            Comments Sentiment
0  Really impressed with the screen quality and s...  Positive
1  Really impressed with the screen quality and s...  Positive
2  Battery life is excellent, and performance is ...  Positive
3          No issues so far, feels almost brand new!  Negative
4  Battery life is excellent, and performance is ...  Positive


In [22]:
import re
from nltk.corpus import stopwords
nltk.download('stopwords')

# Set up stopwords
stop_words = set(stopwords.words('english'))

# Function to preprocess text
def preprocess_text(text):
    text = str(text).lower()  # Convert to lowercase
    text = re.sub(r'[^a-z\s]', '', text)  # Remove special characters
    text = ' '.join(word for word in text.split() if word not in stop_words)  # Remove stopwords
    return text

# Apply preprocessing to the 'Comments' column
df['Cleaned_Comments'] = df['Comments'].apply(preprocess_text)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\KIIT\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [24]:
from sklearn.feature_extraction.text import CountVectorizer

# Initialize CountVectorizer
vectorizer = CountVectorizer()
term_matrix = vectorizer.fit_transform(df['Cleaned_Comments'])

# Get term frequencies
terms = vectorizer.get_feature_names_out()
frequencies = term_matrix.sum(axis=0).tolist()[0]

# Create a DataFrame of term frequencies
term_freq_df = pd.DataFrame({'Term': terms, 'Frequency': frequencies})
term_freq_df = term_freq_df.sort_values(by='Frequency', ascending=False)
print(term_freq_df.head(20))


             Term  Frequency
3469        phone      10272
3794      quality       7750
2019         good       6008
404       battery       5836
4193       screen       5521
3439  performance       5157
2065        great       3718
2670         life       3012
3125          new       2980
668        camera       2972
5504        works       2927
3669        price       2874
1695          far       2790
3857       really       2750
1583    excellent       2723
251          apps       2682
563         brand       2669
2491       issues       2633
5281        value       2628
666          came       2618


In [None]:
# Define the attribute-term mapping
attributes = {
    'Performance': ['fast', 'ram', 'ssd', 'speed'],
    'Price': ['price', 'cost', 'value'],
    # Add more attributes and associated terms here
}

# Initialize attribute frequency dictionary
attribute_freq = {attr: 0 for attr in attributes}

# Sum the frequencies of terms under each attribute
for attr, terms in attributes.items():
    attribute_freq[attr] = term_freq_df[term_freq_df['Term'].isin(terms)]['Frequency'].sum()

# Convert to a DataFrame for visualization
attribute_freq_df = pd.DataFrame(list(attribute_freq.items()), columns=['Attribute', 'Frequency'])
print(attribute_freq_df)
