################################################################################
# 📌 PHASE 1 – PYTHON + WHISPER API SERVER SETUP
################################################################################
# Step 1: Install Python 3.10.10
# 👉 Download from: https://www.python.org/downloads/release/python-31010/
python --version
# Expected output: Python 3.10.10
# Step 2: Create & Activate Virtual Environment
python -m venv venv
.\venv\Scripts\activate
# If activation blocked in PowerShell:
Set-ExecutionPolicy RemoteSigned -Scope CurrentUser
# Step 3: Install Required Packages
python -m pip install --upgrade pip
pip install fastapi uvicorn python-multipart
pip install torch
pip install faster-whisper
# Optional – the reference OpenAI implementation (the server below only uses faster-whisper):
pip install git+https://github.com/openai/whisper.git
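# Optional sanity check: a minimal sketch (assuming the installs above
# succeeded) that loads the tiny model on CPU to confirm faster-whisper works.
# The first run downloads the model weights.
cat > check_install.py << 'EOF'
from faster_whisper import WhisperModel

# Loading the model is enough to verify the install and trigger the download.
model = WhisperModel("tiny", device="cpu")
print("faster-whisper is ready.")
EOF
python check_install.py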
# Step 4: Create API Server File (whisper_api_server.py)
cat > whisper_api_server.py << 'EOF'
from fastapi import FastAPI, File, UploadFile
from faster_whisper import WhisperModel
from typing import Dict
import io

app = FastAPI()

# "tiny" is the smallest/fastest Whisper model; swap in "base" or "small"
# for better accuracy at the cost of CPU time.
model = WhisperModel("tiny", device="cpu")

@app.post("/transcribe")
async def transcribe(audio: UploadFile = File(...)) -> Dict[str, str]:
    try:
        # Read the uploaded WAV into memory and hand it to faster-whisper
        # as a file-like object.
        audio_content = await audio.read()
        audio_file_like = io.BytesIO(audio_content)
        segments, _ = model.transcribe(audio_file_like)
        transcription_text = " ".join(segment.text.strip() for segment in segments)
        return {"transcription": transcription_text}
    except Exception as e:
        return {"error": str(e)}

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
EOF
# Step 5: Run the Server
python whisper_api_server.py
# Server listens on all interfaces (0.0.0.0) on port 8000; test it locally at http://127.0.0.1:8000
# Stop with CTRL+C
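# Optional: test the endpoint without Unity. A minimal sketch, assuming a
# recording.wav exists in the current directory and requests is installed
# (pip install requests):
cat > test_transcribe.py << 'EOF'
import requests

with open("recording.wav", "rb") as f:
    # The field name "audio" must match the server's UploadFile parameter.
    files = {"audio": ("recording.wav", f, "audio/wav")}
    response = requests.post("http://127.0.0.1:8000/transcribe", files=files)

# Expected shape: {"transcription": "..."} or {"error": "..."}
print(response.json())
EOF
python test_transcribe.py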
################################################################################
# 📌 PHASE 2 – UNITY PROJECT SETUP
################################################################################
# Folder structure (inside Assets/)
# ┣ Scripts/
# ┃ ┣ SpeechRecognitionTest.cs
# ┃ ┗ SavWav.cs
# ┣ Resources/
# ┃ ┗ VoiceCommandConfig.json
# ┗ Scenes/
#   ┗ SampleScene.unity
# Step 1: Create VoiceCommandConfig.json
cat > Assets/Resources/VoiceCommandConfig.json << 'EOF'
[
  { "command": "MOVE",  "trigger": "Move" },
  { "command": "STOP",  "trigger": "Stop" },
  { "command": "RUN",   "trigger": "Run" },
  { "command": "DANCE", "trigger": "Dance" },
  { "command": "JUMP",  "trigger": "Jump" }
]
EOF
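# Each entry pairs an ID (command) with the Animator trigger word (trigger).
# To validate the file before opening Unity, a throwaway Python check works
# (a sketch, assuming the path above and that Python is on PATH):
cat > validate_config.py << 'EOF'
import json

with open("Assets/Resources/VoiceCommandConfig.json") as f:
    commands = json.load(f)

# Fail loudly if an entry is missing a field the Unity script expects.
for entry in commands:
    assert "command" in entry and "trigger" in entry, f"malformed entry: {entry}"
    print(f"{entry['command']:>6} -> SetTrigger(\"{entry['trigger']}\")")
EOF
python validate_config.py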
# Step 2: Create SpeechRecognitionTest.cs
cat > Assets/Scripts/SpeechRecognitionTest.cs << 'EOF'
using System.Collections;
using System.Collections.Generic;
using UnityEngine;
using UnityEngine.Networking;
using TMPro;
using UnityEngine.UI;
using System.IO;

[System.Serializable]
public class VoiceCommand {
    public string command;
    public string trigger;
}

// Wrapper type so JsonUtility (which cannot parse a top-level JSON array)
// can deserialize VoiceCommandConfig.json.
[System.Serializable]
public class VoiceCommandList {
    public VoiceCommand[] items;
}

public class SpeechRecognitionTest : MonoBehaviour {
    public Button startButton;
    public Button stopButton;
    public TMP_Text resultText;
    public TMP_Dropdown micDropdown;
    public Animator characterAnimator;

    private AudioClip recording;
    private string micDevice;
    private string filePath;
    private VoiceCommand[] voiceCommands;

    void Start() {
        startButton.onClick.AddListener(StartRecording);
        stopButton.onClick.AddListener(StopRecording);
        micDropdown.ClearOptions();
        micDropdown.AddOptions(new List<string>(Microphone.devices));

        // Load the command -> trigger mapping from Resources/VoiceCommandConfig.json.
        TextAsset config = Resources.Load<TextAsset>("VoiceCommandConfig");
        voiceCommands = JsonUtility.FromJson<VoiceCommandList>(
            "{\"items\":" + config.text + "}").items;
    }

    void StartRecording() {
        micDevice = micDropdown.options[micDropdown.value].text;
        // Record up to 10 seconds at 44.1 kHz from the selected device.
        recording = Microphone.Start(micDevice, false, 10, 44100);
        resultText.text = "Listening...";
    }

    void StopRecording() {
        Microphone.End(micDevice);
        SavWav.Save("recording", recording);
        filePath = Path.Combine(Application.persistentDataPath, "recording.wav");
        StartCoroutine(Upload(filePath));
    }

    IEnumerator Upload(string filePath) {
        WWWForm form = new WWWForm();
        byte[] fileData = File.ReadAllBytes(filePath);
        // Field name "audio" must match the FastAPI UploadFile parameter.
        form.AddBinaryData("audio", fileData, "recording.wav", "audio/wav");
        using (UnityWebRequest www = UnityWebRequest.Post("http://127.0.0.1:8000/transcribe", form)) {
            yield return www.SendWebRequest();
            if (www.result != UnityWebRequest.Result.Success) {
                resultText.text = "Error: " + www.error;
            } else {
                string json = www.downloadHandler.text;
                resultText.text = json;
                HandleCommand(json);
            }
        }
    }

    void HandleCommand(string json) {
        // Substring match against the response text; fires the Animator
        // trigger for every configured command found in the transcription.
        string lower = json.ToLower();
        foreach (VoiceCommand vc in voiceCommands) {
            if (lower.Contains(vc.trigger.ToLower())) {
                characterAnimator.SetTrigger(vc.trigger);
            }
        }
    }
}
EOF
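# Note: HandleCommand does a plain substring match on the response text, so a
# transcription like "I was running late" also fires the Run trigger. If that
# matters, match whole words instead; a sketch of the idea in Python (the same
# regex works in C# via System.Text.RegularExpressions):
cat > word_match.py << 'EOF'
import re

transcription = "I was running late, please stop"
triggers = ["Move", "Stop", "Run", "Dance", "Jump"]

for trigger in triggers:
    substring_hit = trigger.lower() in transcription.lower()
    # \b anchors to word boundaries, so "run" does not match inside "running".
    word_hit = re.search(rf"\b{re.escape(trigger)}\b", transcription, re.IGNORECASE) is not None
    print(f"{trigger:>5}: substring={substring_hit}  whole-word={word_hit}")
EOF
python word_match.py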
# Step 3: Create SavWav.cs
cat > Assets/Scripts/SavWav.cs << 'EOF'
using System;
using System.IO;
using UnityEngine;

// Saves an AudioClip as a 16-bit PCM WAV file under Application.persistentDataPath.
public static class SavWav {
    const int HEADER_SIZE = 44; // standard RIFF/WAVE header length

    public static bool Save(string filename, AudioClip clip) {
        var filepath = Path.Combine(Application.persistentDataPath, filename + ".wav");
        Directory.CreateDirectory(Path.GetDirectoryName(filepath));
        using (var fileStream = CreateEmpty(filepath)) {
            ConvertAndWrite(fileStream, clip);
            WriteHeader(fileStream, clip);
        }
        return true;
    }

    // Reserve space for the header; it is filled in once the data length is known.
    static FileStream CreateEmpty(string filepath) {
        var fileStream = new FileStream(filepath, FileMode.Create);
        byte emptyByte = new byte();
        for (int i = 0; i < HEADER_SIZE; i++) {
            fileStream.WriteByte(emptyByte);
        }
        return fileStream;
    }

    // Convert float samples in [-1, 1] to little-endian 16-bit PCM.
    static void ConvertAndWrite(FileStream fileStream, AudioClip clip) {
        var samples = new float[clip.samples * clip.channels];
        clip.GetData(samples, 0);
        Int16[] intData = new Int16[samples.Length];
        Byte[] bytesData = new Byte[samples.Length * 2];
        const float rescaleFactor = 32767; // max value of a signed 16-bit sample
        for (int i = 0; i < samples.Length; i++) {
            intData[i] = (short)(samples[i] * rescaleFactor);
            Byte[] byteArr = BitConverter.GetBytes(intData[i]);
            byteArr.CopyTo(bytesData, i * 2);
        }
        fileStream.Write(bytesData, 0, bytesData.Length);
    }

    // Write the 44-byte RIFF/WAVE header (uncompressed PCM, 16-bit).
    static void WriteHeader(FileStream fileStream, AudioClip clip) {
        fileStream.Seek(0, SeekOrigin.Begin);
        Byte[] riff = System.Text.Encoding.UTF8.GetBytes("RIFF");
        fileStream.Write(riff, 0, 4);
        Byte[] chunkSize = BitConverter.GetBytes((int)(fileStream.Length - 8));
        fileStream.Write(chunkSize, 0, 4);
        Byte[] wave = System.Text.Encoding.UTF8.GetBytes("WAVE");
        fileStream.Write(wave, 0, 4);
        Byte[] fmt = System.Text.Encoding.UTF8.GetBytes("fmt ");
        fileStream.Write(fmt, 0, 4);
        Byte[] subChunk1 = BitConverter.GetBytes(16); // PCM fmt chunk is 16 bytes
        fileStream.Write(subChunk1, 0, 4);
        UInt16 audioFormat = 1; // 1 = uncompressed PCM
        Byte[] audioFormatBytes = BitConverter.GetBytes(audioFormat);
        fileStream.Write(audioFormatBytes, 0, 2);
        UInt16 numChannels = (ushort)clip.channels;
        fileStream.Write(BitConverter.GetBytes(numChannels), 0, 2);
        Byte[] sampleRate = BitConverter.GetBytes(clip.frequency);
        fileStream.Write(sampleRate, 0, 4);
        Byte[] byteRate = BitConverter.GetBytes(clip.frequency * clip.channels * 2);
        fileStream.Write(byteRate, 0, 4);
        UInt16 blockAlign = (ushort)(clip.channels * 2);
        fileStream.Write(BitConverter.GetBytes(blockAlign), 0, 2);
        UInt16 bps = 16; // bits per sample
        fileStream.Write(BitConverter.GetBytes(bps), 0, 2);
        Byte[] datastring = System.Text.Encoding.UTF8.GetBytes("data");
        fileStream.Write(datastring, 0, 4);
        Byte[] subChunk2 = BitConverter.GetBytes(clip.samples * clip.channels * 2);
        fileStream.Write(subChunk2, 0, 4);
    }
}
EOF
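# To double-check the header SavWav writes, Python's standard wave module can
# read the saved file back. A sketch, assuming recording.wav was copied from
# Unity's Application.persistentDataPath into the current directory:
cat > check_wav.py << 'EOF'
import wave

with wave.open("recording.wav", "rb") as wav:
    print("channels:    ", wav.getnchannels())
    print("sample rate: ", wav.getframerate())  # should be 44100
    print("sample width:", wav.getsampwidth(), "bytes (2 = 16-bit PCM)")
    print("frames:      ", wav.getnframes())
EOF
python check_wav.py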
################################################################################
# 📌 PHASE 3 – RUN & USE
################################################################################
# Step 1: Start Python API
python whisper_api_server.py
# Step 2: Run Unity scene
# - Select microphone in dropdown
# - Click Start → Speak: "Move", "Stop", "Run", "Dance", "Jump"
# - Click Stop → Character animates
# Step 3 (Optional): Use iVCam mobile mic
# - Install iVCam on phone + PC (https://www.e2esoft.com/ivcam/)
# - Connect both devices to the same Wi-Fi network or hotspot
# - In the Unity mic dropdown → choose "iVCam Mic"
################################################################################
# 📌 ABOUT
################################################################################
# Voice Command Project – Phase 1: install Python 3.10.10 and the Whisper
# packages, and run a localhost API server on port 8000. Phase 2: Unity
# integration with a Canvas (Start/Stop buttons, Dropdown, Text), mic input,
# a JSON command config, and character animation control (Move, Stop, Run,
# Dance, Jump) driven by voice commands.